changeset 2267:3739e2ac8bb1

Add a 'beta' level feature to fanalyze for finding matching byte sequences of a specified minimum length in the input files.
author Matti Hamalainen <ccr@tnsp.org>
date Mon, 17 Jun 2019 07:29:06 +0300
parents ecc17b2e129e
children f2ff364065eb
files tools/fanalyze.c
diffstat 1 files changed, 203 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/tools/fanalyze.c	Mon Jun 17 05:31:38 2019 +0300
+++ b/tools/fanalyze.c	Mon Jun 17 07:29:06 2019 +0300
@@ -10,10 +10,13 @@
 #include "dmargs.h"
 #include "dmfile.h"
 
-#define SET_MAX_FILES    64
-#define SET_MAX_ELEMS    256
-#define SET_MAX_VALUES   64
-#define SET_MAX_GREPLIST 64
+#define SET_MAX_FILES      64
+#define SET_MAX_ELEMS      256
+#define SET_MAX_VALUES     64
+#define SET_MAX_GREPLIST   64
+
+#define SET_MAX_SEQUENCES  1024
+#define SET_MAX_PLACES     1024
 
 
 /* Typedefs
@@ -46,9 +49,28 @@
     Uint8 *data;
     size_t size;
     DMStats stats;
+    BOOL analyzed;
 } DMSourceFile;
 
 
+typedef struct  // One place (file + offset) where a matching sequence occurs
+{
+    DMSourceFile *file;  // pointer to file struct where match was found
+    size_t offs;         // byte offset of the match within file->data
+} DMMatchPlace;
+
+
+typedef struct  // One distinct matching byte sequence and the places where it was found
+{
+    size_t len;     // length of the matching sequence in bytes
+    Uint8 *data;    // "const" pointer into the data of one source file, don't free()
+
+    int nfiles;     // number of separate files in which the match was found
+    int nplaces;    // number of entries in use in places[]
+    DMMatchPlace places[SET_MAX_PLACES];  // match locations, at most SET_MAX_PLACES of them
+} DMMatchSeq;
+
+
 enum
 {
     DMGV_UINT8 = 0,
@@ -117,6 +139,7 @@
     FA_ANALYZE,
     FA_GREP,
     FA_OFFSET,
+    FA_MATCHES,
 };
 
 
@@ -128,6 +151,10 @@
 DMStats        totalStats;
 int            nsetGrepValues = 0;
 DMGrepValue    setGrepValues[SET_MAX_VALUES];
+size_t         optMinMatchLen = 8;
+
+DMMatchSeq dmSequences[SET_MAX_SEQUENCES];
+int ndmSequences = 0;
 
 
 /* Arguments
@@ -138,6 +165,8 @@
     {  1, 'v', "verbose",     "Be more verbose", OPT_NONE },
     {  2, 'g', "grep",        "Binary grep <val>[,<val2>...][:<le|be>[8|16|32]]", OPT_ARGREQ },
     {  3, 'o', "offset",      "Show data in offset <offs>[,<offs2>...][:<le|be>[8|16|32][d|x]]", OPT_ARGREQ },
+    {  4, 'm', "match",       "Find matching sequences minimum of <n> bytes long", OPT_NONE },
+    {  5, 'n', "minmatch",    "Minimum match sequence length", OPT_ARGREQ },
 };
 
 static const int optListN = sizeof(optList) / sizeof(optList[0]);
@@ -159,6 +188,9 @@
     "matches any value of the specified (or inferred) type. For example:\n"
     "-g 0x0f,7,5,?,5,?,? will match sequence of bytes 0f 07 05 ?? 05 ?? ??\n"
     "and -g 0xe,0x1001,?,2023:le16 will match le16 value 000e 1001 ???? 07e7\n"
+    "\n"
+    "NOTICE! Matching sequences search (-m) is considered unfinished and\n"
+    "under development.\n"
     );
 }
 
@@ -467,6 +499,20 @@
         case 3:
             return argParseGrepValue(optArg, FA_OFFSET) == DMERR_OK;
 
+        case 4:
+            setMode = FA_MATCHES;
+            break;
+
+        case 5:
+            optMinMatchLen = atoi(optArg);
+            if (optMinMatchLen < 2 || optMinMatchLen > 16*1024)
+            {
+                dmErrorMsg("Invalid minimum match length '%s'.\n",
+                    optArg);
+                return FALSE;
+            }
+            return TRUE;
+
         default:
             dmErrorMsg("Unknown argument '%s'.\n", currArg);
             return FALSE;
@@ -525,15 +571,71 @@
 }
 
 
+// Record an occurrence of the 'len' byte run at 'data', found in 'file' at
+// offset 'offs'. Identical runs share one DMMatchSeq; a repeated (file, offs)
+// pair is ignored, and the record is dropped when either table is full.
+void dmAddMatchSequence(Uint8 *data, const size_t len, DMSourceFile *file, size_t offs)
+{
+    DMMatchSeq *seq = NULL;
+    BOOL newFile = TRUE;
+
+    // Check for existing match sequence
+    for (int n = 0; n < ndmSequences; n++)
+    {
+        DMMatchSeq *node = &dmSequences[n];
+        if (node->len == len &&
+            (node->data == data || memcmp(node->data, data, len) == 0))
+        {
+            seq = node;
+            break;
+        }
+    }
+
+    if (seq == NULL)
+    {
+        // No sequence found, add a new one; every slot of the table is usable
+        if (ndmSequences >= SET_MAX_SEQUENCES)
+        {
+            dmErrorMsg("Too many matching sequences found.\n");
+            return;
+        }
+        seq = &dmSequences[ndmSequences++];
+        seq->data = data;
+        seq->len = len;
+    }
+
+    // Skip an already recorded place, and note whether the file is already listed
+    for (int n = 0; n < seq->nplaces; n++)
+    {
+        DMMatchPlace *place = &seq->places[n];
+        if (place->file == file && place->offs == offs)
+            return;
+        if (place->file == file)
+            newFile = FALSE;
+    }
+
+    // Add another file + offset, counting each distinct file only once
+    if (seq->nplaces >= SET_MAX_PLACES)
+        return;
+    seq->places[seq->nplaces].file = file;
+    seq->places[seq->nplaces].offs = offs;
+    seq->nplaces++;
+    if (newFile)
+        seq->nfiles++;
+}
+
+
 int main(int argc, char *argv[])
 {
     DMCompElem *compBuf = NULL;
     size_t compBufSize = 0, totalSize = 0;
     int res;
 
+    memset(&dmSequences, 0, sizeof(dmSequences));
+
     dmInitProg("fanalyze", "Simple tool for file format analysis",
         "0.4", NULL, NULL);
-    dmVerbosity = 1;
+    dmVerbosity = 0;
 
     dmInitStats(&totalStats);
 
@@ -749,7 +851,9 @@
 
         printf("\n");
 
+        //
         // Attempt further analysis
+        //
         for (int nfile = 0; nfile < nsrcFiles; nfile++)
         {
             DMSourceFile *file = &srcFiles[nfile];
@@ -803,6 +907,100 @@
             printf("Most used bytes for '%s':\n", file->filename);
             dmPrintStats(&file->stats, 16, file->size);
         }
+
+    }
+    else
+    if (setMode == FA_MATCHES)
+    {
+        //
+        // Attempt to find matching sequences of N+
+        //
+        dmPrint(2, "Attempting to find matching sequences of %" DM_PRIu_SIZE_T" bytes or more\n",
+            optMinMatchLen);
+
+        for (int nfile1 = 0; nfile1 < nsrcFiles; nfile1++)
+        {
+            DMSourceFile *file1 = &srcFiles[nfile1];
+
+            for (int nfile2 = 0; nfile2 < nsrcFiles; nfile2++)
+            if (nfile2 != nfile1 && !file1->analyzed)
+            {
+                DMSourceFile *file2 = &srcFiles[nfile2];
+
+                // Find longest possible matching sequence in file2, if any
+                size_t moffs1 = 0, moffs2 = 0;
+                while (moffs1 + optMinMatchLen < file1->size &&
+                       moffs2 + optMinMatchLen < file2->size)
+                {
+                    size_t cnt;
+                    for (cnt = 0; moffs1 + cnt < file1->size && moffs2 + cnt < file2->size; cnt++)
+                    {
+                        if (file1->data[moffs1 + cnt] != file2->data[moffs2 + cnt])
+                            break;
+                    }
+
+                    if (cnt >= optMinMatchLen)
+                    {
+                        // Match found
+                        dmAddMatchSequence(file1->data + moffs1, cnt, file1, moffs1);
+                        dmAddMatchSequence(file2->data + moffs2, cnt, file2, moffs2);
+
+                        moffs1 += cnt;
+                    }
+                    else
+                    {
+                        moffs1++;
+                        moffs2++;
+                    }
+                }
+            }
+            file1->analyzed = TRUE;
+        }
+
+        //
+        // Display results
+        //
+        dmPrint(0, "Found %d matching sequence groups of %" DM_PRIu_SIZE_T " bytes minimum.\n",
+            ndmSequences, optMinMatchLen);
+
+        for (int nmatch = 0; nmatch < ndmSequences; nmatch++)
+        {
+            DMMatchSeq *seq = &dmSequences[nmatch];
+
+            printf("\nSeq of %" DM_PRIu_SIZE_T " bytes in %d places (%d files)\n",
+                seq->len, seq->nplaces, seq->nfiles);
+
+            if (dmVerbosity > 0)
+            {
+                int n = 0;
+                for (size_t offs = 0; offs < seq->len; offs++)
+                {
+                    if (n == 0)
+                        printf("    ");
+
+                    printf("%02x%s",
+                        seq->data[offs],
+                        offs + 1 < seq->len ? " " : "");
+
+                    if (++n >= 16)
+                    {
+                        printf("\n");
+                        n = 0;
+                    }
+                }
+                if (n > 0)
+                    printf("\n");
+            }
+
+            for (int nplace = 0; nplace < seq->nplaces; nplace++)
+            {
+                DMMatchPlace *place = &seq->places[nplace];
+                printf("    %08" DM_PRIx_SIZE_T ": %s\n",
+                    place->offs,
+                    place->file->filename);
+
+            }
+        }
     }
     else
     {