# HG changeset patch # User Matti Hamalainen # Date 1560745746 -10800 # Node ID 3739e2ac8bb1ae98513202b0454b2adbaff810b7 # Parent ecc17b2e129e42fd2301a1991c505716e3ccfde1 Add 'beta' level feature to fanalyze, for finding matching byte sequences of minimum specified length from input files. diff -r ecc17b2e129e -r 3739e2ac8bb1 tools/fanalyze.c --- a/tools/fanalyze.c Mon Jun 17 05:31:38 2019 +0300 +++ b/tools/fanalyze.c Mon Jun 17 07:29:06 2019 +0300 @@ -10,10 +10,13 @@ #include "dmargs.h" #include "dmfile.h" -#define SET_MAX_FILES 64 -#define SET_MAX_ELEMS 256 -#define SET_MAX_VALUES 64 -#define SET_MAX_GREPLIST 64 +#define SET_MAX_FILES 64 +#define SET_MAX_ELEMS 256 +#define SET_MAX_VALUES 64 +#define SET_MAX_GREPLIST 64 + +#define SET_MAX_SEQUENCES 1024 +#define SET_MAX_PLACES 1024 /* Typedefs @@ -46,9 +49,28 @@ Uint8 *data; size_t size; DMStats stats; + BOOL analyzed; } DMSourceFile; +typedef struct +{ + DMSourceFile *file; // pointer to file struct where match was found + size_t offs; // offset to match in file data +} DMMatchPlace; + + +typedef struct +{ + size_t len; // length of the matching sequence + Uint8 *data; // "const" pointer to data in one file, don't free() + + int nfiles; // number of separate files match was found + int nplaces; // number of places where match was found + DMMatchPlace places[SET_MAX_PLACES]; +} DMMatchSeq; + + enum { DMGV_UINT8 = 0, @@ -117,6 +139,7 @@ FA_ANALYZE, FA_GREP, FA_OFFSET, + FA_MATCHES, }; @@ -128,6 +151,10 @@ DMStats totalStats; int nsetGrepValues = 0; DMGrepValue setGrepValues[SET_MAX_VALUES]; +size_t optMinMatchLen = 8; + +DMMatchSeq dmSequences[SET_MAX_SEQUENCES]; +int ndmSequences = 0; /* Arguments @@ -138,6 +165,8 @@ { 1, 'v', "verbose", "Be more verbose", OPT_NONE }, { 2, 'g', "grep", "Binary grep [,...][:[8|16|32]]", OPT_ARGREQ }, { 3, 'o', "offset", "Show data in offset [,...][:[8|16|32][d|x]]", OPT_ARGREQ }, + { 4, 'm', "match", "Find matching sequences minimum of bytes long", OPT_NONE }, + { 5, 'n', "minmatch", "Minimum match sequence length", OPT_ARGREQ }, }; static const int optListN = sizeof(optList) / sizeof(optList[0]); @@ -159,6 +188,9 @@ "matches any value of the specified (or inferred) type. For example:\n" "-g 0x0f,7,5,?,5,?,? will match sequence of bytes 0f 07 05 ?? 05 ?? ??\n" "and -g 0xe,0x1001,?,2023:le16 will match le16 value 000e 1001 ???? 07e7\n" + "\n" + "NOTICE! Matching sequences search (-m) is considered unfinished and\n" + "under development.\n" ); } @@ -467,6 +499,20 @@ case 3: return argParseGrepValue(optArg, FA_OFFSET) == DMERR_OK; + case 4: + setMode = FA_MATCHES; + break; + + case 5: + optMinMatchLen = atoi(optArg); + if (optMinMatchLen < 2 || optMinMatchLen > 16*1024) + { + dmErrorMsg("Invalid minimum match length '%s'.\n", + optArg); + return FALSE; + } + return TRUE; + default: dmErrorMsg("Unknown argument '%s'.\n", currArg); return FALSE; @@ -525,15 +571,71 @@ } +void dmAddMatchSequence(Uint8 *data, const size_t len, DMSourceFile *file, size_t offs) +{ + DMMatchSeq *seq = NULL; + + // Check for existing match sequence + for (int n = 0; n < ndmSequences; n++) + { + DMMatchSeq *node = &dmSequences[n]; + if (node->len == len && + (node->data == data || + memcmp(node->data, data, node->len) == 0)) + { + seq = node; + break; + } + } + + if (seq == NULL) + { + // No sequence found, add a new one + if (ndmSequences + 1 >= SET_MAX_SEQUENCES) + { + dmErrorMsg("Too many matching sequences found.\n"); + return; + } + + seq = &dmSequences[ndmSequences++]; + seq->data = data; + seq->len = len; + } + else + { + // Check for existing + for (int n = 0; n < seq->nplaces; n++) + { + DMMatchPlace *place = &seq->places[n]; + if (place->offs == offs && place->file == file) + return; + } + } + + // Add another file + offset + if (seq->nplaces < SET_MAX_PLACES) + { + DMMatchPlace *place = &seq->places[seq->nplaces++]; + place->file = file; + place->offs = offs; + } + else + return; + +} + + int main(int argc, char *argv[]) { DMCompElem *compBuf = NULL; size_t compBufSize = 0, totalSize = 0; int res; + memset(&dmSequences, 0, sizeof(dmSequences)); + dmInitProg("fanalyze", "Simple tool for file format analysis", "0.4", NULL, NULL); - dmVerbosity = 1; + dmVerbosity = 0; dmInitStats(&totalStats); @@ -749,7 +851,9 @@ printf("\n"); + // // Attempt further analysis + // for (int nfile = 0; nfile < nsrcFiles; nfile++) { DMSourceFile *file = &srcFiles[nfile]; @@ -803,6 +907,100 @@ printf("Most used bytes for '%s':\n", file->filename); dmPrintStats(&file->stats, 16, file->size); } + + } + else + if (setMode == FA_MATCHES) + { + // + // Attempt to find matching sequences of N+ + // + dmPrint(2, "Attempting to find matching sequences of %" DM_PRIu_SIZE_T" bytes or more\n", + optMinMatchLen); + + for (int nfile1 = 0; nfile1 < nsrcFiles; nfile1++) + { + DMSourceFile *file1 = &srcFiles[nfile1]; + + for (int nfile2 = 0; nfile2 < nsrcFiles; nfile2++) + if (nfile2 != nfile1 && !file1->analyzed) + { + DMSourceFile *file2 = &srcFiles[nfile2]; + + // Find longest possible matching sequence in file2, if any + size_t moffs1 = 0, moffs2 = 0; + while (moffs1 + optMinMatchLen < file1->size && + moffs2 + optMinMatchLen < file2->size) + { + size_t cnt; + for (cnt = 0; moffs1 + cnt < file1->size && moffs2 + cnt < file2->size; cnt++) + { + if (file1->data[moffs1 + cnt] != file2->data[moffs2 + cnt]) + break; + } + + if (cnt >= optMinMatchLen) + { + // Match found + dmAddMatchSequence(file1->data + moffs1, cnt, file1, moffs1); + dmAddMatchSequence(file2->data + moffs2, cnt, file2, moffs2); + + moffs1 += cnt; + } + else + { + moffs1++; + moffs2++; + } + } + } + file1->analyzed = TRUE; + } + + // + // Display results + // + dmPrint(0, "Found %d matching sequence groups of %" DM_PRIu_SIZE_T " bytes minimum.\n", + ndmSequences, optMinMatchLen); + + for (int nmatch = 0; nmatch < ndmSequences; nmatch++) + { + DMMatchSeq *seq = &dmSequences[nmatch]; + + printf("\nSeq of %" DM_PRIu_SIZE_T " bytes in %d places (%d files)\n", + seq->len, seq->nplaces, seq->nfiles); + + if (dmVerbosity > 0) + { + int n = 0; + for (size_t offs = 0; offs < seq->len; offs++) + { + if (n == 0) + printf(" "); + + printf("%02x%s", + seq->data[offs], + offs + 1 < seq->len ? " " : ""); + + if (++n >= 16) + { + printf("\n"); + n = 0; + } + } + if (n > 0) + printf("\n"); + } + + for (int nplace = 0; nplace < seq->nplaces; nplace++) + { + DMMatchPlace *place = &seq->places[nplace]; + printf(" %08" DM_PRIx_SIZE_T ": %s\n", + place->offs, + place->file->filename); + + } + } } else {