Mercurial > hg > dmlib
comparison tools/fanalyze.c @ 2267:3739e2ac8bb1
Add 'beta' level feature to fanalyze, for finding matching byte sequences of minimum
specified length from input files.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Mon, 17 Jun 2019 07:29:06 +0300 |
parents | 6ba4aed198e4 |
children | f2ff364065eb |
comparison
equal
deleted
inserted
replaced
2266:ecc17b2e129e | 2267:3739e2ac8bb1 |
---|---|
8 #include "dmtool.h" | 8 #include "dmtool.h" |
9 #include "dmlib.h" | 9 #include "dmlib.h" |
10 #include "dmargs.h" | 10 #include "dmargs.h" |
11 #include "dmfile.h" | 11 #include "dmfile.h" |
12 | 12 |
13 #define SET_MAX_FILES 64 | 13 #define SET_MAX_FILES 64 |
14 #define SET_MAX_ELEMS 256 | 14 #define SET_MAX_ELEMS 256 |
15 #define SET_MAX_VALUES 64 | 15 #define SET_MAX_VALUES 64 |
16 #define SET_MAX_GREPLIST 64 | 16 #define SET_MAX_GREPLIST 64 |
17 | |
18 #define SET_MAX_SEQUENCES 1024 | |
19 #define SET_MAX_PLACES 1024 | |
17 | 20 |
18 | 21 |
19 /* Typedefs | 22 /* Typedefs |
20 */ | 23 */ |
21 typedef struct | 24 typedef struct |
44 { | 47 { |
45 char *filename; | 48 char *filename; |
46 Uint8 *data; | 49 Uint8 *data; |
47 size_t size; | 50 size_t size; |
48 DMStats stats; | 51 DMStats stats; |
52 BOOL analyzed; | |
49 } DMSourceFile; | 53 } DMSourceFile; |
54 | |
55 | |
56 typedef struct | |
57 { | |
58 DMSourceFile *file; // pointer to file struct where match was found | |
59 size_t offs; // offset to match in file data | |
60 } DMMatchPlace; | |
61 | |
62 | |
63 typedef struct | |
64 { | |
65 size_t len; // length of the matching sequence | |
66 Uint8 *data; // "const" pointer to data in one file, don't free() | |
67 | |
68 int nfiles; // number of separate files match was found | |
69 int nplaces; // number of places where match was found | |
70 DMMatchPlace places[SET_MAX_PLACES]; | |
71 } DMMatchSeq; | |
50 | 72 |
51 | 73 |
52 enum | 74 enum |
53 { | 75 { |
54 DMGV_UINT8 = 0, | 76 DMGV_UINT8 = 0, |
115 enum | 137 enum |
116 { | 138 { |
117 FA_ANALYZE, | 139 FA_ANALYZE, |
118 FA_GREP, | 140 FA_GREP, |
119 FA_OFFSET, | 141 FA_OFFSET, |
142 FA_MATCHES, | |
120 }; | 143 }; |
121 | 144 |
122 | 145 |
123 /* Global variables | 146 /* Global variables |
124 */ | 147 */ |
126 int nsrcFiles = 0; // Number of source files | 149 int nsrcFiles = 0; // Number of source files |
127 DMSourceFile srcFiles[SET_MAX_FILES]; // Source file names | 150 DMSourceFile srcFiles[SET_MAX_FILES]; // Source file names |
128 DMStats totalStats; | 151 DMStats totalStats; |
129 int nsetGrepValues = 0; | 152 int nsetGrepValues = 0; |
130 DMGrepValue setGrepValues[SET_MAX_VALUES]; | 153 DMGrepValue setGrepValues[SET_MAX_VALUES]; |
154 size_t optMinMatchLen = 8; | |
155 | |
156 DMMatchSeq dmSequences[SET_MAX_SEQUENCES]; | |
157 int ndmSequences = 0; | |
131 | 158 |
132 | 159 |
133 /* Arguments | 160 /* Arguments |
134 */ | 161 */ |
135 static const DMOptArg optList[] = | 162 static const DMOptArg optList[] = |
136 { | 163 { |
137 { 0, '?', "help", "Show this help", OPT_NONE }, | 164 { 0, '?', "help", "Show this help", OPT_NONE }, |
138 { 1, 'v', "verbose", "Be more verbose", OPT_NONE }, | 165 { 1, 'v', "verbose", "Be more verbose", OPT_NONE }, |
139 { 2, 'g', "grep", "Binary grep <val>[,<val2>...][:<le|be>[8|16|32]]", OPT_ARGREQ }, | 166 { 2, 'g', "grep", "Binary grep <val>[,<val2>...][:<le|be>[8|16|32]]", OPT_ARGREQ }, |
140 { 3, 'o', "offset", "Show data in offset <offs>[,<offs2>...][:<le|be>[8|16|32][d|x]]", OPT_ARGREQ }, | 167 { 3, 'o', "offset", "Show data in offset <offs>[,<offs2>...][:<le|be>[8|16|32][d|x]]", OPT_ARGREQ }, |
168 { 4, 'm', "match", "Find matching sequences minimum of <n> bytes long", OPT_NONE }, | |
169 { 5, 'n', "minmatch", "Minimum match sequence length", OPT_ARGREQ }, | |
141 }; | 170 }; |
142 | 171 |
143 static const int optListN = sizeof(optList) / sizeof(optList[0]); | 172 static const int optListN = sizeof(optList) / sizeof(optList[0]); |
144 | 173 |
145 | 174 |
157 "\n" | 186 "\n" |
158 "Value lists for grep function can contain wildcard '?' (or '#') which\n" | 187 "Value lists for grep function can contain wildcard '?' (or '#') which\n" |
159 "matches any value of the specified (or inferred) type. For example:\n" | 188 "matches any value of the specified (or inferred) type. For example:\n" |
160 "-g 0x0f,7,5,?,5,?,? will match sequence of bytes 0f 07 05 ?? 05 ?? ??\n" | 189 "-g 0x0f,7,5,?,5,?,? will match sequence of bytes 0f 07 05 ?? 05 ?? ??\n" |
161 "and -g 0xe,0x1001,?,2023:le16 will match le16 value 000e 1001 ???? 07e7\n" | 190 "and -g 0xe,0x1001,?,2023:le16 will match le16 value 000e 1001 ???? 07e7\n" |
191 "\n" | |
192 "NOTICE! Matching sequences search (-m) is considered unfinished and\n" | |
193 "under development.\n" | |
162 ); | 194 ); |
163 } | 195 } |
164 | 196 |
165 | 197 |
166 BOOL dmGetData(const int type, const DMSourceFile *file, const size_t offs, Uint32 *mval) | 198 BOOL dmGetData(const int type, const DMSourceFile *file, const size_t offs, Uint32 *mval) |
465 return argParseGrepValue(optArg, FA_GREP) == DMERR_OK; | 497 return argParseGrepValue(optArg, FA_GREP) == DMERR_OK; |
466 | 498 |
467 case 3: | 499 case 3: |
468 return argParseGrepValue(optArg, FA_OFFSET) == DMERR_OK; | 500 return argParseGrepValue(optArg, FA_OFFSET) == DMERR_OK; |
469 | 501 |
502 case 4: | |
503 setMode = FA_MATCHES; | |
504 break; | |
505 | |
506 case 5: | |
507 optMinMatchLen = atoi(optArg); | |
508 if (optMinMatchLen < 2 || optMinMatchLen > 16*1024) | |
509 { | |
510 dmErrorMsg("Invalid minimum match length '%s'.\n", | |
511 optArg); | |
512 return FALSE; | |
513 } | |
514 return TRUE; | |
515 | |
470 default: | 516 default: |
471 dmErrorMsg("Unknown argument '%s'.\n", currArg); | 517 dmErrorMsg("Unknown argument '%s'.\n", currArg); |
472 return FALSE; | 518 return FALSE; |
473 } | 519 } |
474 | 520 |
523 } | 569 } |
524 printf("\n\n"); | 570 printf("\n\n"); |
525 } | 571 } |
526 | 572 |
527 | 573 |
574 void dmAddMatchSequence(Uint8 *data, const size_t len, DMSourceFile *file, size_t offs) | |
575 { | |
576 DMMatchSeq *seq = NULL; | |
577 | |
578 // Check for existing match sequence | |
579 for (int n = 0; n < ndmSequences; n++) | |
580 { | |
581 DMMatchSeq *node = &dmSequences[n]; | |
582 if (node->len == len && | |
583 (node->data == data || | |
584 memcmp(node->data, data, node->len) == 0)) | |
585 { | |
586 seq = node; | |
587 break; | |
588 } | |
589 } | |
590 | |
591 if (seq == NULL) | |
592 { | |
593 // No sequence found, add a new one | |
594 if (ndmSequences + 1 >= SET_MAX_SEQUENCES) | |
595 { | |
596 dmErrorMsg("Too many matching sequences found.\n"); | |
597 return; | |
598 } | |
599 | |
600 seq = &dmSequences[ndmSequences++]; | |
601 seq->data = data; | |
602 seq->len = len; | |
603 } | |
604 else | |
605 { | |
606 // Check for existing | |
607 for (int n = 0; n < seq->nplaces; n++) | |
608 { | |
609 DMMatchPlace *place = &seq->places[n]; | |
610 if (place->offs == offs && place->file == file) | |
611 return; | |
612 } | |
613 } | |
614 | |
615 // Add another file + offset | |
616 if (seq->nplaces < SET_MAX_PLACES) | |
617 { | |
618 DMMatchPlace *place = &seq->places[seq->nplaces++]; | |
619 place->file = file; | |
620 place->offs = offs; | |
621 } | |
622 else | |
623 return; | |
624 | |
625 } | |
626 | |
627 | |
528 int main(int argc, char *argv[]) | 628 int main(int argc, char *argv[]) |
529 { | 629 { |
530 DMCompElem *compBuf = NULL; | 630 DMCompElem *compBuf = NULL; |
531 size_t compBufSize = 0, totalSize = 0; | 631 size_t compBufSize = 0, totalSize = 0; |
532 int res; | 632 int res; |
533 | 633 |
634 memset(&dmSequences, 0, sizeof(dmSequences)); | |
635 | |
534 dmInitProg("fanalyze", "Simple tool for file format analysis", | 636 dmInitProg("fanalyze", "Simple tool for file format analysis", |
535 "0.4", NULL, NULL); | 637 "0.4", NULL, NULL); |
536 dmVerbosity = 1; | 638 dmVerbosity = 0; |
537 | 639 |
538 dmInitStats(&totalStats); | 640 dmInitStats(&totalStats); |
539 | 641 |
540 // Parse arguments | 642 // Parse arguments |
541 if (!dmArgsProcess(argc, argv, optList, optListN, | 643 if (!dmArgsProcess(argc, argv, optList, optListN, |
747 } | 849 } |
748 } | 850 } |
749 | 851 |
750 printf("\n"); | 852 printf("\n"); |
751 | 853 |
854 // | |
752 // Attempt further analysis | 855 // Attempt further analysis |
856 // | |
753 for (int nfile = 0; nfile < nsrcFiles; nfile++) | 857 for (int nfile = 0; nfile < nsrcFiles; nfile++) |
754 { | 858 { |
755 DMSourceFile *file = &srcFiles[nfile]; | 859 DMSourceFile *file = &srcFiles[nfile]; |
756 size_t len = file->size > compBufSize ? compBufSize : file->size; | 860 size_t len = file->size > compBufSize ? compBufSize : file->size; |
757 for (size_t offs = 0; offs + 4 < len; offs++) | 861 for (size_t offs = 0; offs + 4 < len; offs++) |
801 { | 905 { |
802 DMSourceFile *file = &srcFiles[nfile]; | 906 DMSourceFile *file = &srcFiles[nfile]; |
803 printf("Most used bytes for '%s':\n", file->filename); | 907 printf("Most used bytes for '%s':\n", file->filename); |
804 dmPrintStats(&file->stats, 16, file->size); | 908 dmPrintStats(&file->stats, 16, file->size); |
805 } | 909 } |
910 | |
911 } | |
912 else | |
913 if (setMode == FA_MATCHES) | |
914 { | |
915 // | |
916 // Attempt to find matching sequences of N+ | |
917 // | |
918 dmPrint(2, "Attempting to find matching sequences of %" DM_PRIu_SIZE_T" bytes or more\n", | |
919 optMinMatchLen); | |
920 | |
921 for (int nfile1 = 0; nfile1 < nsrcFiles; nfile1++) | |
922 { | |
923 DMSourceFile *file1 = &srcFiles[nfile1]; | |
924 | |
925 for (int nfile2 = 0; nfile2 < nsrcFiles; nfile2++) | |
926 if (nfile2 != nfile1 && !file1->analyzed) | |
927 { | |
928 DMSourceFile *file2 = &srcFiles[nfile2]; | |
929 | |
930 // Find longest possible matching sequence in file2, if any | |
931 size_t moffs1 = 0, moffs2 = 0; | |
932 while (moffs1 + optMinMatchLen < file1->size && | |
933 moffs2 + optMinMatchLen < file2->size) | |
934 { | |
935 size_t cnt; | |
936 for (cnt = 0; moffs1 + cnt < file1->size && moffs2 + cnt < file2->size; cnt++) | |
937 { | |
938 if (file1->data[moffs1 + cnt] != file2->data[moffs2 + cnt]) | |
939 break; | |
940 } | |
941 | |
942 if (cnt >= optMinMatchLen) | |
943 { | |
944 // Match found | |
945 dmAddMatchSequence(file1->data + moffs1, cnt, file1, moffs1); | |
946 dmAddMatchSequence(file2->data + moffs2, cnt, file2, moffs2); | |
947 | |
948 moffs1 += cnt; | |
949 } | |
950 else | |
951 { | |
952 moffs1++; | |
953 moffs2++; | |
954 } | |
955 } | |
956 } | |
957 file1->analyzed = TRUE; | |
958 } | |
959 | |
960 // | |
961 // Display results | |
962 // | |
963 dmPrint(0, "Found %d matching sequence groups of %" DM_PRIu_SIZE_T " bytes minimum.\n", | |
964 ndmSequences, optMinMatchLen); | |
965 | |
966 for (int nmatch = 0; nmatch < ndmSequences; nmatch++) | |
967 { | |
968 DMMatchSeq *seq = &dmSequences[nmatch]; | |
969 | |
970 printf("\nSeq of %" DM_PRIu_SIZE_T " bytes in %d places (%d files)\n", | |
971 seq->len, seq->nplaces, seq->nfiles); | |
972 | |
973 if (dmVerbosity > 0) | |
974 { | |
975 int n = 0; | |
976 for (size_t offs = 0; offs < seq->len; offs++) | |
977 { | |
978 if (n == 0) | |
979 printf(" "); | |
980 | |
981 printf("%02x%s", | |
982 seq->data[offs], | |
983 offs + 1 < seq->len ? " " : ""); | |
984 | |
985 if (++n >= 16) | |
986 { | |
987 printf("\n"); | |
988 n = 0; | |
989 } | |
990 } | |
991 if (n > 0) | |
992 printf("\n"); | |
993 } | |
994 | |
995 for (int nplace = 0; nplace < seq->nplaces; nplace++) | |
996 { | |
997 DMMatchPlace *place = &seq->places[nplace]; | |
998 printf(" %08" DM_PRIx_SIZE_T ": %s\n", | |
999 place->offs, | |
1000 place->file->filename); | |
1001 | |
1002 } | |
1003 } | |
806 } | 1004 } |
807 else | 1005 else |
808 { | 1006 { |
809 dmErrorMsg("Invalid operating mode?\n"); | 1007 dmErrorMsg("Invalid operating mode?\n"); |
810 } | 1008 } |