view tools/fanalyze.c @ 2032:5fe25336a474

Bump fanalyze version.
author Matti Hamalainen <ccr@tnsp.org>
date Tue, 27 Nov 2018 12:22:16 +0200
parents 8a9ef75fd3cd
children 45ad06bb60c4
line wrap: on
line source

/*
 * Fanalyze - Analyze similarities between multiple files
 * Programmed and designed by Matti 'ccr' Hamalainen
 * (C) Copyright 2018 Tecnic Software productions (TNSP)
 *
 * Please read file 'COPYING' for information on license and distribution.
 */
#include "dmtool.h"
#include "dmlib.h"
#include "dmargs.h"
#include "dmfile.h"

// Capacity of the srcFiles[] table (maximum number of input files)
#define SET_MAX_FILES    64
// Size of the per-byte-value tables; these are indexed with Uint8 values
#define SET_MAX_ELEMS    256
// Capacity of the setGrepValues[] table (maximum number of grep values)
#define SET_MAX_VALUES   32


/* Typedefs
 */
/* Per-offset comparison element. main() allocates one of these for each
 * byte offset of the comparison buffer.
 */
typedef struct
{
    // Incremented once per source file, indexed by the byte value that
    // file has at this offset.
    Uint8 stats[SET_MAX_ELEMS];

    // variants = number of distinct byte values seen at this offset,
    // data     = highest byte value seen (only printed when variants <= 1)
    Uint8 variants, data;

    // Accumulated score per analysis variant; main() only uses
    // indices 0..3 of the 16 slots.
    int interest[16];

    // Number of times a file contributed to interest[] for the variant
    int interestF[16];
} DMCompElem;


/* One entry of a byte usage table: a byte value and how many times it
 * has been counted. The value is stored explicitly because the table
 * is reordered by qsort() in dmPrintStats().
 */
typedef struct
{
    int count;      // Number of occurrences counted for 'value'
    Uint8 value;    // The byte value this entry describes
} DMStatValue;


/* Byte usage table with one entry per possible byte value.
 * After dmInitStats() entry n describes byte value n; dmPrintStats()
 * sorts the entries, after which that no longer holds.
 */
typedef struct
{
    DMStatValue cv[SET_MAX_ELEMS];
} DMStats;


/* One input file: its name, its contents as read by main() and
 * the byte usage statistics gathered from it.
 */
typedef struct
{
    char *filename;     // Points to the command line argument, not a copy
    Uint8 *data;        // File contents, released with dmFree() in main()
    size_t size; // offset, crop_start, crop_end, doCrop?
    DMStats stats;      // Per-file byte usage counts
} DMSourceFile;


/* Value types supported by the binary grep mode. The values are used
 * as indices to dmGrepTypes[], so the two must be kept in the same order.
 */
enum
{
    DMGV_uint8 = 0,
    DMGV_uint16_le,
    DMGV_uint16_be,
    DMGV_uint32_le,
    DMGV_uint32_be,

    DMGV_last       // Number of types, not a valid type itself
};


/* Description of one grep value type.
 */
typedef struct
{
    char *name;             // Human readable name used in messages
    uint32_t nmax;          // Largest value representable by the type
    unsigned int bsize;     // Size of the type in bytes
} DMGrepDef;


/* Table of grep value types, indexed by the DMGV_* constants.
 *
 * The maximum values are written out as constants instead of being
 * computed as (1UL << bits) - 1, because shifting an unsigned long
 * by 32 is undefined when unsigned long is only 32 bits wide.
 */
static const DMGrepDef dmGrepTypes[DMGV_last] =
{
    { "8bit (byte)"      , 0xffUL       , 1 },
    { "16bit (word) LE"  , 0xffffUL     , 2 },
    { "16bit (word) BE"  , 0xffffUL     , 2 },
    { "32bit (word) LE"  , 0xffffffffUL , 4 },
    { "32bit (word) BE"  , 0xffffffffUL , 4 },
};


/* One value to search for in grep mode.
 */
typedef struct
{
    int type;           // DMGV_* constant, used as index to dmGrepTypes[]
    uint32_t value;     // The value to look for
} DMGrepValue;


/* Global variables
 */
int            nsrcFiles = 0;              // Number of entries used in srcFiles[]
DMSourceFile   srcFiles[SET_MAX_FILES];    // Source files (names, data and statistics)
DMStats        totalStats;                 // Byte usage counts over all source files
int            nsetGrepValues = 0;         // Number of entries used in setGrepValues[]
DMGrepValue    setGrepValues[SET_MAX_VALUES]; // Values to search for in grep mode


/* Arguments
 */
// Command line options. The first field of each entry is the number that
// argHandleOpt() receives as 'optN' and dispatches on.
// NOTE(review): the meaning of the remaining DMOptArg fields is defined
// by dmargs.h, which is not visible here - presumably short option, long
// option, help text and argument flags.
static const DMOptArg optList[] =
{
    {  0, '?', "help",        "Show this help", OPT_NONE },
    {  1, 'v', "verbose",     "Be more verbose", OPT_NONE },
    {  2, 'g', "grep",        "Binary grep mode <val>[,<le|be>[8|16|32]]", OPT_ARGREQ },
};

// Number of entries in optList[]
static const int optListN = sizeof(optList) / sizeof(optList[0]);


/* Print the program banner with the usage line, followed by
 * the list of supported options, to standard output.
 */
void argShowHelp()
{
    FILE *outFile = stdout;

    dmPrintBanner(outFile, dmProgName,
        "[options] <input file #1> <input file #2> [...]");

    dmArgsPrintHelp(outFile, optList, optListN, 0);
}


/* Parse one grep value specification and append it to setGrepValues[].
 *
 * The argument consists of an integer value, optionally followed by
 * a comma and a type specification. The type specification is an
 * optional endianess prefix ("le" or "be", case-insensitive, little
 * endian being the default) followed by the bit size "8", "16" or "32".
 *
 * If no type specification is given, the smallest type that can hold
 * the value is chosen, preferring little endian for multi-byte types.
 *
 * Returns DMERR_OK on success. On failure the error has been reported
 * through dmError() and its return value is passed on to the caller.
 */
int argParseGrepValue(const char *arg)
{
    const char *sep = strchr(arg, ',');
    char *vspec, *vstr;
    int vtype = -1, ret = DMERR_OK;
    uint32_t vval;

    // Do we have spec?
    if (sep != NULL)
    {
        vspec = dm_strdup_trim(sep + 1, DM_TRIM_BOTH);
        vstr = dm_strndup_trim(arg, sep - arg, DM_TRIM_BOTH);
    }
    else
    {
        vspec = NULL;
        vstr = dm_strdup(arg);
    }

    // Parse spec if any
    if (vspec != NULL)
    {
        BOOL vendianess = TRUE;
        char *vtmp = vspec;

        // Get endianess specifier, if any
        if (dm_strncasecmp(vtmp, "le", 2) == 0)
        {
            vendianess = TRUE;
            vtmp += 2;
        }
        else
        if (dm_strncasecmp(vtmp, "be", 2) == 0)
        {
            vendianess = FALSE;
            vtmp += 2;
        }

        // Get value bit size
        if (strcmp(vtmp, "8") == 0)
            vtype = DMGV_uint8;
        else
        if (strcmp(vtmp, "16") == 0)
            vtype = vendianess ? DMGV_uint16_le : DMGV_uint16_be;
        else
        if (strcmp(vtmp, "32") == 0)
            vtype = vendianess ? DMGV_uint32_le : DMGV_uint32_be;
        else
        {
            ret = dmError(DMERR_INVALID_ARGS,
                "Invalid grep type '%s'.\n",
                vspec);
            goto out;
        }
    }

    // Get value
    if (!dmGetIntVal(vstr, &vval, NULL))
    {
        ret = dmError(DMERR_INVALID_ARGS,
            "Not a valid integer value '%s'.\n",
            vstr);
        goto out;
    }

    // Check if we need to guess size
    if (vtype < 0)
    {
        // Walk the table from the last valid entry downwards, so that
        // the lowest index (smallest type) that fits is what remains.
        // DMGV_last itself is one past the end of the table.
        for (int n = DMGV_last - 1; n >= 0; n--)
        {
            const DMGrepDef *def = &dmGrepTypes[n];
            if (vval <= def->nmax)
                vtype = n;
        }
    }

    if (vtype < 0)
    {
        ret = dmError(DMERR_INVALID_ARGS,
            "Could not guess value type for '%s'.\n",
            arg);
        goto out;
    }

    // Check range
    if (vval > dmGrepTypes[vtype].nmax)
    {
        // The values are unsigned 32-bit, print them as such
        ret = dmError(DMERR_INVALID_ARGS,
            "Integer value %lu out of range [0, %lu] for type %s.\n",
            (unsigned long) vval,
            (unsigned long) dmGrepTypes[vtype].nmax,
            dmGrepTypes[vtype].name);

        goto out;
    }

    if (nsetGrepValues < SET_MAX_VALUES)
    {
        DMGrepValue *node = &setGrepValues[nsetGrepValues++];
        node->type = vtype;
        node->value = vval;

        dmMsg(1, "Grep value %s : %lu / 0x%lx\n",
            dmGrepTypes[vtype].name,
            (unsigned long) vval, (unsigned long) vval);
    }
    else
    {
        ret = dmError(DMERR_BOUNDS,
            "Too many greps specified (max %d).\n",
            SET_MAX_VALUES);
    }

out:
    // vspec is NULL when no type specification was given
    dmFree(vspec);
    dmFree(vstr);
    return ret;
}


/* Option callback given to dmArgsProcess() in main().
 *
 * optN is the option number from optList[], optArg the argument of the
 * option (only used by the grep option) and currArg the command line
 * argument being processed, used for the error message.
 *
 * Returns TRUE if the option was handled, FALSE on error.
 * The help option does not return, it exits the program.
 */
BOOL argHandleOpt(const int optN, char *optArg, char *currArg)
{
    if (optN == 0)
    {
        // Help: print usage and terminate successfully
        argShowHelp();
        exit(0);
    }

    if (optN == 1)
    {
        // Each occurrence raises the verbosity by one
        dmVerbosity++;
        return TRUE;
    }

    if (optN == 2)
        return argParseGrepValue(optArg) == DMERR_OK;

    dmErrorMsg("Unknown argument '%s'.\n", currArg);
    return FALSE;
}


BOOL argHandleNonOpt(char *currArg)
{
    if (nsrcFiles < SET_MAX_FILES)
    {
        DMSourceFile *file = &srcFiles[nsrcFiles++];
        file->filename = currArg;
        return TRUE;
    }
    else
    {
        dmErrorMsg("Maximum number of input files exceeded (%d).\n",
            SET_MAX_FILES);
        return TRUE;
    }
}


/* Reset a byte usage table: entry n is set to describe
 * byte value n with an occurrence count of zero.
 */
void dmInitStats(DMStats *stats)
{
    size_t idx = 0;

    while (idx < SET_MAX_ELEMS)
    {
        DMStatValue *entry = &stats->cv[idx];

        entry->value = idx;
        entry->count = 0;
        idx++;
    }
}


/* qsort() comparison function for DMStatValue entries.
 * Orders the entries by descending occurrence count.
 */
int dmCompareStatFunc(const void *va, const void *vb)
{
    const int countA = ((const DMStatValue *) va)->count;
    const int countB = ((const DMStatValue *) vb)->count;

    // Positive when b has the larger count, so b sorts before a
    return countB - countA;
}


/* Print the most used byte values of a byte usage table.
 *
 * stats - table to print; it is sorted in place by descending count,
 *         so afterwards entry n no longer describes byte value n
 * nmax  - number of entries to print, limited to the table size
 * size  - total number of bytes counted into the table, used for
 *         computing the percentages
 *
 * Each entry is printed as the byte value in hex, followed by its
 * count and its share of 'size' in percent. If 'size' is zero, the
 * share is printed as zero instead of dividing by zero.
 */
void dmPrintStats(DMStats *stats, const int nmax, const size_t size)
{
    const int nshow = nmax > SET_MAX_ELEMS ? SET_MAX_ELEMS : nmax;

    qsort(stats->cv, SET_MAX_ELEMS, sizeof(stats->cv[0]), dmCompareStatFunc);

    for (int n = 0; n < nshow; n++)
    {
        const DMStatValue *sv = &stats->cv[n];
        const float share = size > 0 ?
            ((float) sv->count * 100.0f) / (float) size : 0.0f;

        printf("$%02x (%d = %1.2f%%), ",
            sv->value, sv->count, share);
    }
    printf("\n\n");
}


int main(int argc, char *argv[])
{
    DMCompElem *compBuf = NULL;
    size_t compBufSize = 0, totalSize = 0;
    int res;

    dmInitProg("fanalyze", "File format analyzer", "0.2", NULL, NULL);
    dmVerbosity = 1;

    dmInitStats(&totalStats);

    // Parse arguments
    if (!dmArgsProcess(argc, argv, optList, optListN,
        argHandleOpt, argHandleNonOpt, OPTH_BAILOUT))
        exit(1);

    if (nsrcFiles < 1)
    {
        dmErrorMsg("Nothing to do. (try --help)\n");
        goto out;
    }

    // Read input files
    for (int nfile = 0; nfile < nsrcFiles; nfile++)
    {
        DMSourceFile *file = &srcFiles[nfile];
        if ((res = dmReadDataFile(NULL, file->filename, &file->data, &file->size)) != DMERR_OK)
        {
            dmErrorMsg("Could not read '%s': %s\n",
                file->filename, dmErrorStr(res));
            goto out;
        }

        dmPrint(2, "Input #%d: '%s', %" DM_PRIu_SIZE_T " bytes.\n",
            nfile + 1, file->filename, file->size);

        if (!compBufSize || file->size < compBufSize)
            compBufSize = file->size;

        totalSize += file->size;
        dmInitStats(&file->stats);
    }


    // Check if we are in grep mode
    if (nsetGrepValues > 0)
    {
        for (int nfile = 0; nfile < nsrcFiles; nfile++)
        {
            DMSourceFile *file = &srcFiles[nfile];
            dmPrint(0, "\n%s:\n", file->filename);

            for (int n = 0; n < nsetGrepValues; n++)
            {
                DMGrepValue *node = &setGrepValues[n];
                const DMGrepDef *def = &dmGrepTypes[node->type];

                for (size_t offs = 0; offs + def->bsize < file->size; offs++)
                {
                    uint32_t mval = -1;
                    switch (node->type)
                    {
                        case DMGV_uint8:
                            mval = *((uint8_t *) (file->data + offs));
                            break;

                        case DMGV_uint16_le:
                            mval = DM_LE16_TO_NATIVE(*((uint16_t *) (file->data + offs)));
                            break;

                        case DMGV_uint16_be:
                            mval = DM_BE16_TO_NATIVE(*((uint16_t *) (file->data + offs)));
                            break;

                        case DMGV_uint32_le:
                            mval = DM_LE32_TO_NATIVE(*((uint32_t *) (file->data + offs)));
                            break;

                        case DMGV_uint32_be:
                            mval = DM_BE32_TO_NATIVE(*((uint32_t *) (file->data + offs)));
                            break;
                    }

                    if (mval == node->value)
                    {
                        dmPrint(0, "%08x : %s match %d / 0x%x\n",
                            offs, def->name, mval, mval);

                    }
                }
            }
        }
        goto out;
    }

    // Allocate comparision buffer
    // XXX: integer overflow?
    dmPrint(2, "Allocating %d element (%d bytes) comparision buffer.\n",
        compBufSize, compBufSize * sizeof(DMCompElem));

    if ((compBuf = dmCalloc(compBufSize, sizeof(DMCompElem))) == NULL)
    {
        dmErrorMsg("Out of memory. Could not allocate comparision buffer!\n");
        goto out;
    }

    // Begin analyzing ..
    dmPrint(2, "Analyzing ..\n");
    for (int nfile = 0; nfile < nsrcFiles; nfile++)
    {
        DMSourceFile *file = &srcFiles[nfile];

        for (size_t offs = 0; offs < file->size; offs++)
        {
            Uint8 bv = file->data[offs];
            totalStats.cv[bv].count++;
            file->stats.cv[bv].count++;
        }

        for (size_t offs = 0; offs < compBufSize; offs++)
        {
            Uint8 data = offs < file->size ? file->data[offs] : 0;
            compBuf[offs].stats[data]++;
        }
    }

    for (size_t offs = 0; offs < compBufSize; offs++)
    {
        DMCompElem *el = &compBuf[offs];
        for (int n = 0; n < SET_MAX_ELEMS; n++)
        {
            if (el->stats[n] > 0)
            {
                el->variants++;
                el->data = n;
            }
        }
    }

    // Display results
    for (size_t offs = 0, n = 0; offs < compBufSize; offs++)
    {
        DMCompElem *el = &compBuf[offs];
        BOOL var = el->variants > 1;

        if (n == 0)
            printf("%08" DM_PRIx_SIZE_T " | ", offs);

        if (var)
            printf("[%2d] ", el->variants);
        else
            printf(" %02x  ", el->data);

        if (++n >= 16)
        {
            printf("\n");
            n = 0;
        }
    }

    printf("\n");

    // Attempt further analysis
    for (int nfile = 0; nfile < nsrcFiles; nfile++)
    {
        DMSourceFile *file = &srcFiles[nfile];
        size_t len = file->size > compBufSize ? compBufSize : file->size;
        for (size_t offs = 0; offs + 4 < len; offs++)
        {
            DMCompElem *elem = &compBuf[offs];

            for (int variant = 3; variant >= 0; variant--)
            {
                size_t nmax = (variant < 2) ? sizeof(Uint16) : sizeof(Uint32);
                Uint32 tmp = 0;

                for (size_t n = 0; n < nmax; n++)
                {
                    size_t boffs = (variant & 1) ? n : nmax - n;

                    tmp <<= 8;
                    tmp |= file->data[offs + boffs];
                }

                if (file->size - tmp < 32)
                {
                    elem->interest[variant] += 32 - (file->size - tmp);
                    elem->interestF[variant]++;
                }
            }
        }
    }

    printf("\nMore findings:\n");
    for (size_t offs = 0; offs + 4 < compBufSize; offs++)
    {
        DMCompElem *elem = &compBuf[offs];

        for (int variant = 0; variant < 4; variant++)
        if (elem->interestF[variant] > 0)
        {
            printf("%08" DM_PRIx_SIZE_T " | V%d : %d / %d\n",
            offs, variant,
            elem->interestF[variant], elem->interest[variant]);
        }
    }

    printf("\nGlobal most used bytes:\n");
    dmPrintStats(&totalStats, 16, totalSize);

    for (int nfile = 0; nfile < nsrcFiles; nfile++)
    {
        DMSourceFile *file = &srcFiles[nfile];
        printf("Most used bytes for '%s':\n", file->filename);
        dmPrintStats(&file->stats, 16, file->size);
    }

out:
    for (int nfile = 0; nfile < nsrcFiles; nfile++)
    {
        DMSourceFile *file = &srcFiles[nfile];
        dmFree(file->data);
    }

    return 0;
}