comparison tools/fanalyze.c @ 2267:3739e2ac8bb1

Add 'beta' level feature to fanalyze, for finding matching byte sequences of minimum specified length from input files.
author Matti Hamalainen <ccr@tnsp.org>
date Mon, 17 Jun 2019 07:29:06 +0300
parents 6ba4aed198e4
children f2ff364065eb
comparison
equal deleted inserted replaced
2266:ecc17b2e129e 2267:3739e2ac8bb1
8 #include "dmtool.h" 8 #include "dmtool.h"
9 #include "dmlib.h" 9 #include "dmlib.h"
10 #include "dmargs.h" 10 #include "dmargs.h"
11 #include "dmfile.h" 11 #include "dmfile.h"
12 12
13 #define SET_MAX_FILES 64 13 #define SET_MAX_FILES 64
14 #define SET_MAX_ELEMS 256 14 #define SET_MAX_ELEMS 256
15 #define SET_MAX_VALUES 64 15 #define SET_MAX_VALUES 64
16 #define SET_MAX_GREPLIST 64 16 #define SET_MAX_GREPLIST 64
17
18 #define SET_MAX_SEQUENCES 1024
19 #define SET_MAX_PLACES 1024
17 20
18 21
19 /* Typedefs 22 /* Typedefs
20 */ 23 */
21 typedef struct 24 typedef struct
44 { 47 {
45 char *filename; 48 char *filename;
46 Uint8 *data; 49 Uint8 *data;
47 size_t size; 50 size_t size;
48 DMStats stats; 51 DMStats stats;
52 BOOL analyzed;
49 } DMSourceFile; 53 } DMSourceFile;
54
55
// One recorded occurrence of a matching sequence: the file it was seen in
// and the position inside that file's data buffer.
56 typedef struct
57 {
58 DMSourceFile *file; // pointer to the source file struct where the match was found
59 size_t offs; // byte offset of the match within file->data
60 } DMMatchPlace;
61
62
// One distinct matching byte sequence together with the list of places
// (file + offset pairs) where it has been found.
63 typedef struct
64 {
65 size_t len; // length of the matching sequence, in bytes
66 Uint8 *data; // "const" pointer into the data of one source file; not owned, don't free()
67
68 int nfiles; // number of separate files the match was found in
69 int nplaces; // number of entries currently used in places[]
70 DMMatchPlace places[SET_MAX_PLACES]; // recorded occurrences, at most SET_MAX_PLACES of them
71 } DMMatchSeq;
50 72
51 73
52 enum 74 enum
53 { 75 {
54 DMGV_UINT8 = 0, 76 DMGV_UINT8 = 0,
115 enum 137 enum
116 { 138 {
117 FA_ANALYZE, 139 FA_ANALYZE,
118 FA_GREP, 140 FA_GREP,
119 FA_OFFSET, 141 FA_OFFSET,
142 FA_MATCHES,
120 }; 143 };
121 144
122 145
123 /* Global variables 146 /* Global variables
124 */ 147 */
126 int nsrcFiles = 0; // Number of source files 149 int nsrcFiles = 0; // Number of source files
127 DMSourceFile srcFiles[SET_MAX_FILES]; // Source file names 150 DMSourceFile srcFiles[SET_MAX_FILES]; // Source file names
128 DMStats totalStats; 151 DMStats totalStats;
129 int nsetGrepValues = 0; 152 int nsetGrepValues = 0;
130 DMGrepValue setGrepValues[SET_MAX_VALUES]; 153 DMGrepValue setGrepValues[SET_MAX_VALUES];
154 size_t optMinMatchLen = 8;
155
156 DMMatchSeq dmSequences[SET_MAX_SEQUENCES];
157 int ndmSequences = 0;
131 158
132 159
133 /* Arguments 160 /* Arguments
134 */ 161 */
135 static const DMOptArg optList[] = 162 static const DMOptArg optList[] =
136 { 163 {
137 { 0, '?', "help", "Show this help", OPT_NONE }, 164 { 0, '?', "help", "Show this help", OPT_NONE },
138 { 1, 'v', "verbose", "Be more verbose", OPT_NONE }, 165 { 1, 'v', "verbose", "Be more verbose", OPT_NONE },
139 { 2, 'g', "grep", "Binary grep <val>[,<val2>...][:<le|be>[8|16|32]]", OPT_ARGREQ }, 166 { 2, 'g', "grep", "Binary grep <val>[,<val2>...][:<le|be>[8|16|32]]", OPT_ARGREQ },
140 { 3, 'o', "offset", "Show data in offset <offs>[,<offs2>...][:<le|be>[8|16|32][d|x]]", OPT_ARGREQ }, 167 { 3, 'o', "offset", "Show data in offset <offs>[,<offs2>...][:<le|be>[8|16|32][d|x]]", OPT_ARGREQ },
168 { 4, 'm', "match", "Find matching sequences minimum of <n> bytes long", OPT_NONE },
169 { 5, 'n', "minmatch", "Minimum match sequence length", OPT_ARGREQ },
141 }; 170 };
142 171
143 static const int optListN = sizeof(optList) / sizeof(optList[0]); 172 static const int optListN = sizeof(optList) / sizeof(optList[0]);
144 173
145 174
157 "\n" 186 "\n"
158 "Value lists for grep function can contain wildcard '?' (or '#') which\n" 187 "Value lists for grep function can contain wildcard '?' (or '#') which\n"
159 "matches any value of the specified (or inferred) type. For example:\n" 188 "matches any value of the specified (or inferred) type. For example:\n"
160 "-g 0x0f,7,5,?,5,?,? will match sequence of bytes 0f 07 05 ?? 05 ?? ??\n" 189 "-g 0x0f,7,5,?,5,?,? will match sequence of bytes 0f 07 05 ?? 05 ?? ??\n"
161 "and -g 0xe,0x1001,?,2023:le16 will match le16 value 000e 1001 ???? 07e7\n" 190 "and -g 0xe,0x1001,?,2023:le16 will match le16 value 000e 1001 ???? 07e7\n"
191 "\n"
192 "NOTICE! Matching sequences search (-m) is considered unfinished and\n"
193 "under development.\n"
162 ); 194 );
163 } 195 }
164 196
165 197
166 BOOL dmGetData(const int type, const DMSourceFile *file, const size_t offs, Uint32 *mval) 198 BOOL dmGetData(const int type, const DMSourceFile *file, const size_t offs, Uint32 *mval)
465 return argParseGrepValue(optArg, FA_GREP) == DMERR_OK; 497 return argParseGrepValue(optArg, FA_GREP) == DMERR_OK;
466 498
467 case 3: 499 case 3:
468 return argParseGrepValue(optArg, FA_OFFSET) == DMERR_OK; 500 return argParseGrepValue(optArg, FA_OFFSET) == DMERR_OK;
469 501
502 case 4:
503 setMode = FA_MATCHES;
504 break;
505
506 case 5:
507 optMinMatchLen = atoi(optArg);
508 if (optMinMatchLen < 2 || optMinMatchLen > 16*1024)
509 {
510 dmErrorMsg("Invalid minimum match length '%s'.\n",
511 optArg);
512 return FALSE;
513 }
514 return TRUE;
515
470 default: 516 default:
471 dmErrorMsg("Unknown argument '%s'.\n", currArg); 517 dmErrorMsg("Unknown argument '%s'.\n", currArg);
472 return FALSE; 518 return FALSE;
473 } 519 }
474 520
523 } 569 }
524 printf("\n\n"); 570 printf("\n\n");
525 } 571 }
526 572
527 573
574 void dmAddMatchSequence(Uint8 *data, const size_t len, DMSourceFile *file, size_t offs)
575 {
576 DMMatchSeq *seq = NULL;
577
578 // Check for existing match sequence
579 for (int n = 0; n < ndmSequences; n++)
580 {
581 DMMatchSeq *node = &dmSequences[n];
582 if (node->len == len &&
583 (node->data == data ||
584 memcmp(node->data, data, node->len) == 0))
585 {
586 seq = node;
587 break;
588 }
589 }
590
591 if (seq == NULL)
592 {
593 // No sequence found, add a new one
594 if (ndmSequences + 1 >= SET_MAX_SEQUENCES)
595 {
596 dmErrorMsg("Too many matching sequences found.\n");
597 return;
598 }
599
600 seq = &dmSequences[ndmSequences++];
601 seq->data = data;
602 seq->len = len;
603 }
604 else
605 {
606 // Check for existing
607 for (int n = 0; n < seq->nplaces; n++)
608 {
609 DMMatchPlace *place = &seq->places[n];
610 if (place->offs == offs && place->file == file)
611 return;
612 }
613 }
614
615 // Add another file + offset
616 if (seq->nplaces < SET_MAX_PLACES)
617 {
618 DMMatchPlace *place = &seq->places[seq->nplaces++];
619 place->file = file;
620 place->offs = offs;
621 }
622 else
623 return;
624
625 }
626
627
528 int main(int argc, char *argv[]) 628 int main(int argc, char *argv[])
529 { 629 {
530 DMCompElem *compBuf = NULL; 630 DMCompElem *compBuf = NULL;
531 size_t compBufSize = 0, totalSize = 0; 631 size_t compBufSize = 0, totalSize = 0;
532 int res; 632 int res;
533 633
634 memset(&dmSequences, 0, sizeof(dmSequences));
635
534 dmInitProg("fanalyze", "Simple tool for file format analysis", 636 dmInitProg("fanalyze", "Simple tool for file format analysis",
535 "0.4", NULL, NULL); 637 "0.4", NULL, NULL);
536 dmVerbosity = 1; 638 dmVerbosity = 0;
537 639
538 dmInitStats(&totalStats); 640 dmInitStats(&totalStats);
539 641
540 // Parse arguments 642 // Parse arguments
541 if (!dmArgsProcess(argc, argv, optList, optListN, 643 if (!dmArgsProcess(argc, argv, optList, optListN,
747 } 849 }
748 } 850 }
749 851
750 printf("\n"); 852 printf("\n");
751 853
854 //
752 // Attempt further analysis 855 // Attempt further analysis
856 //
753 for (int nfile = 0; nfile < nsrcFiles; nfile++) 857 for (int nfile = 0; nfile < nsrcFiles; nfile++)
754 { 858 {
755 DMSourceFile *file = &srcFiles[nfile]; 859 DMSourceFile *file = &srcFiles[nfile];
756 size_t len = file->size > compBufSize ? compBufSize : file->size; 860 size_t len = file->size > compBufSize ? compBufSize : file->size;
757 for (size_t offs = 0; offs + 4 < len; offs++) 861 for (size_t offs = 0; offs + 4 < len; offs++)
801 { 905 {
802 DMSourceFile *file = &srcFiles[nfile]; 906 DMSourceFile *file = &srcFiles[nfile];
803 printf("Most used bytes for '%s':\n", file->filename); 907 printf("Most used bytes for '%s':\n", file->filename);
804 dmPrintStats(&file->stats, 16, file->size); 908 dmPrintStats(&file->stats, 16, file->size);
805 } 909 }
910
911 }
912 else
913 if (setMode == FA_MATCHES)
914 {
915 //
916 // Attempt to find matching sequences of N+
917 //
918 dmPrint(2, "Attempting to find matching sequences of %" DM_PRIu_SIZE_T" bytes or more\n",
919 optMinMatchLen);
920
921 for (int nfile1 = 0; nfile1 < nsrcFiles; nfile1++)
922 {
923 DMSourceFile *file1 = &srcFiles[nfile1];
924
925 for (int nfile2 = 0; nfile2 < nsrcFiles; nfile2++)
926 if (nfile2 != nfile1 && !file1->analyzed)
927 {
928 DMSourceFile *file2 = &srcFiles[nfile2];
929
930 // Find longest possible matching sequence in file2, if any
931 size_t moffs1 = 0, moffs2 = 0;
932 while (moffs1 + optMinMatchLen < file1->size &&
933 moffs2 + optMinMatchLen < file2->size)
934 {
935 size_t cnt;
936 for (cnt = 0; moffs1 + cnt < file1->size && moffs2 + cnt < file2->size; cnt++)
937 {
938 if (file1->data[moffs1 + cnt] != file2->data[moffs2 + cnt])
939 break;
940 }
941
942 if (cnt >= optMinMatchLen)
943 {
944 // Match found
945 dmAddMatchSequence(file1->data + moffs1, cnt, file1, moffs1);
946 dmAddMatchSequence(file2->data + moffs2, cnt, file2, moffs2);
947
948 moffs1 += cnt;
949 }
950 else
951 {
952 moffs1++;
953 moffs2++;
954 }
955 }
956 }
957 file1->analyzed = TRUE;
958 }
959
960 //
961 // Display results
962 //
963 dmPrint(0, "Found %d matching sequence groups of %" DM_PRIu_SIZE_T " bytes minimum.\n",
964 ndmSequences, optMinMatchLen);
965
966 for (int nmatch = 0; nmatch < ndmSequences; nmatch++)
967 {
968 DMMatchSeq *seq = &dmSequences[nmatch];
969
970 printf("\nSeq of %" DM_PRIu_SIZE_T " bytes in %d places (%d files)\n",
971 seq->len, seq->nplaces, seq->nfiles);
972
973 if (dmVerbosity > 0)
974 {
975 int n = 0;
976 for (size_t offs = 0; offs < seq->len; offs++)
977 {
978 if (n == 0)
979 printf(" ");
980
981 printf("%02x%s",
982 seq->data[offs],
983 offs + 1 < seq->len ? " " : "");
984
985 if (++n >= 16)
986 {
987 printf("\n");
988 n = 0;
989 }
990 }
991 if (n > 0)
992 printf("\n");
993 }
994
995 for (int nplace = 0; nplace < seq->nplaces; nplace++)
996 {
997 DMMatchPlace *place = &seq->places[nplace];
998 printf(" %08" DM_PRIx_SIZE_T ": %s\n",
999 place->offs,
1000 place->file->filename);
1001
1002 }
1003 }
806 } 1004 }
807 else 1005 else
808 { 1006 {
809 dmErrorMsg("Invalid operating mode?\n"); 1007 dmErrorMsg("Invalid operating mode?\n");
810 } 1008 }