changeset 1996:4a4c3e15b8c2

Add more analyzing.
author Matti Hamalainen <ccr@tnsp.org>
date Mon, 02 Jul 2018 02:26:52 +0300
parents ea6337c873c6
children 2f8506171064
files tools/fanalyze.c
diffstat 1 files changed, 134 insertions(+), 10 deletions(-) [+]
line wrap: on
line diff
--- a/tools/fanalyze.c	Sun Jul 01 23:09:34 2018 +0300
+++ b/tools/fanalyze.c	Mon Jul 02 02:26:52 2018 +0300
@@ -10,16 +10,40 @@
 #include "dmargs.h"
 #include "dmfile.h"
 
-#define SET_MAX_FILES    (8)
+#define SET_MAX_FILES    16     // Capacity of the srcFiles[] table (maximum number of input files)
+#define SET_MAX_ELEMS    256    // Number of distinct Uint8 values; size of the per-byte-value tables
 
 
 /* Typedefs
  */
 typedef struct
 {
+    Uint8 stats[SET_MAX_ELEMS];  // Indexed by byte value; main() bumps the entry for the byte each file has at this offset
+    Uint8 variants, data;        // Set in main(): variants counts byte values with non-zero stats, data holds the last such value
+    int interest[16];            // Indexed by variant (main() uses 0..3); accumulates 32 - (file size - decoded value)
+    int interestF[16];           // Indexed by variant; number of times the matching interest[] entry was updated
+} DMCompElem;
+
+
+typedef struct
+{
+    int count;      // Number of occurrences counted for 'value'
+    Uint8 value;    // Byte value this entry describes (set by dmInitStats())
+} DMStatValue;
+
+
+typedef struct
+{
+    DMStatValue cv[SET_MAX_ELEMS];   // One counter per byte value; indexed by byte value until the table is sorted
+} DMStats;
+
+
+typedef struct
+{
     char *filename;
     Uint8 *data;
     size_t size; // offset, crop_start, crop_end, doCrop?
+    DMStats stats;   // Byte-value occurrence counts for this file's data
 } DMSourceFile;
 
 
@@ -27,6 +51,7 @@
  */
 int           nsrcFiles = 0;              // Number of source files
 DMSourceFile  srcFiles[SET_MAX_FILES];    // Source file names
+DMStats totalStats;                       // Byte-value counts summed over all source files
 
 
 /* Arguments
@@ -83,28 +108,57 @@
     {
         dmErrorMsg("Maximum number of input files exceeded (%d).\n",
             SET_MAX_FILES);
-        return FALSE;
+        return TRUE;
+    }
+}
+
+
+void dmInitStats(DMStats *stats)   // Reset every counter to zero and label each entry with its byte value
+{
+    for (size_t n = 0; n < SET_MAX_ELEMS; n++)
+    {
+        stats->cv[n].count = 0;
+        stats->cv[n].value = n;    // Entry n initially describes byte value n
     }
 }
 
 
-#define SET_MAX_ELEMS 256
-typedef struct
+int dmCompareStatFunc(const void *va, const void *vb)   // qsort() comparator over DMStatValue entries
+{
+    const DMStatValue *pa = va, *pb = vb;
+    return pb->count - pa->count;   // Positive when b has the larger count, giving descending order by count
+}
+
+
+void dmSortStats(DMStats *stats)   // Sort the SET_MAX_ELEMS entries of stats->cv by descending count
 {
-    Uint8 counts[SET_MAX_ELEMS];
-    Uint8 variants, data;
-} DMCompElem;
+    qsort(stats->cv, SET_MAX_ELEMS, sizeof(DMStatValue), dmCompareStatFunc);   // Argument order: base, element count, element size, comparator
+}
+
+
+void dmPrintStats(DMStats *stats, const int nmax, const size_t size)   // Print the first nmax entries as "$hex (share of size in %)"
+{
+    for (int n = 0; n < nmax && n < SET_MAX_ELEMS; n++)   // Never read past the end of stats->cv
+    {
+        printf("$%02x (%1.2f%%), ",
+            stats->cv[n].value,
+            size > 0 ? ((float) stats->cv[n].count * 100.0f) / (float) size : 0.0f);   // Report 0% instead of dividing by zero on empty input
+    }
+    printf("\n\n");
+}
 
 
 int main(int argc, char *argv[])
 {
     DMCompElem *compBuf = NULL;
-    size_t compBufSize = 0;
+    size_t compBufSize = 0, totalSize = 0;
     int res;
 
     dmInitProg("fanalyze", "File format analyzer", "0.1", NULL, NULL);
     dmVerbosity = 1;
 
+    dmInitStats(&totalStats);
+
     // Parse arguments
     if (!dmArgsProcess(argc, argv, optList, optListN,
         argHandleOpt, argHandleNonOpt, OPTH_BAILOUT))
@@ -132,6 +186,8 @@
 
         if (!compBufSize || file->size < compBufSize)
             compBufSize = file->size;
+
+        totalSize += file->size;
     }
 
     // Allocate comparision buffer
@@ -150,19 +206,32 @@
     for (int nfile = 0; nfile < nsrcFiles; nfile++)
     {
         DMSourceFile *file = &srcFiles[nfile];
+        dmInitStats(&file->stats);
+
+        for (size_t offs = 0; offs < file->size; offs++)
+        {
+            Uint8 bv = file->data[offs];
+            totalStats.cv[bv].count++;
+            file->stats.cv[bv].count++;
+        }
+
         for (size_t offs = 0; offs < compBufSize; offs++)
         {
             Uint8 data = offs < file->size ? file->data[offs] : 0;
-            compBuf[offs].counts[data]++;
+            compBuf[offs].stats[data]++;
         }
+
+        dmSortStats(&file->stats);
     }
+    dmSortStats(&totalStats);
+
 
     for (size_t offs = 0; offs < compBufSize; offs++)
     {
         DMCompElem *el = &compBuf[offs];
         for (int n = 0; n < SET_MAX_ELEMS; n++)
         {
-            if (el->counts[n] > 0)
+            if (el->stats[n] > 0)
             {
                 el->variants++;
                 el->data = n;
@@ -193,6 +262,61 @@
 
     printf("\n");
 
+    // Attempt further analysis
+    for (int nfile = 0; nfile < nsrcFiles; nfile++)
+    {
+        DMSourceFile *file = &srcFiles[nfile];
+        size_t len = file->size > compBufSize ? compBufSize : file->size;
+        for (size_t offs = 0; offs + 4 < len; offs++)
+        {
+            DMCompElem *elem = &compBuf[offs];
+
+            for (int variant = 3; variant >= 0; variant--)
+            {
+                size_t nmax = (variant < 2) ? sizeof(Uint16) : sizeof(Uint32);
+                Uint32 tmp = 0;
+
+                for (size_t n = 0; n < nmax; n++)
+                {
+                    size_t boffs = (variant & 1) ? n : nmax - n;
+
+                    tmp <<= 8;
+                    tmp |= file->data[offs + boffs];
+                }
+
+                if (file->size - tmp < 32)
+                {
+                    elem->interest[variant] += 32 - (file->size - tmp);
+                    elem->interestF[variant]++;
+                }
+            }
+        }
+    }
+
+    printf("\nMore findings:\n");
+    for (size_t offs = 0; offs + 4 < compBufSize; offs++)
+    {
+        DMCompElem *elem = &compBuf[offs];
+
+        for (int variant = 0; variant < 4; variant++)
+        if (elem->interestF[variant] > 0)
+        {
+            printf("%08" DM_PRIx_SIZE_T " | V%d : %d / %d\n",
+            offs, variant,
+            elem->interestF[variant], elem->interest[variant]);
+        }
+    }
+
+    printf("\nGlobal most used bytes:\n");
+    dmPrintStats(&totalStats, 16, totalSize);
+
+    for (int nfile = 0; nfile < nsrcFiles; nfile++)
+    {
+        DMSourceFile *file = &srcFiles[nfile];
+        printf("Most used bytes for '%s':\n", file->filename);
+        dmPrintStats(&file->stats, 16, file->size);
+    }
+
 out:
     for (int nfile = 0; nfile < nsrcFiles; nfile++)
     {