/*===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
*/

#include "sra-stat.vers.h"

#include <kapp/main.h>

#include <sra/sradb.h>
#include <sra/sradb-priv.h>
#include <sra/types.h>

#include <kdb/table.h>
#include <kdb/kdb-priv.h>

#include <kfs/directory.h>

#include <klib/container.h>
#include <klib/log.h>
#include <klib/out.h>
#include <klib/debug.h> /* DBGMSG */
#include <klib/rc.h>

#include <assert.h>
#include <stdlib.h>
#include <string.h>

/* DISP_RC
 *  report "msg" through LOGERR at klogInt level when "rc" is non-zero;
 *  does nothing when "rc" is zero.
 *
 *  "rc" is parenthesized in the comparison so that a compound argument
 *  ( e.g. "a | b" ) is tested as a whole.
 *  NB - "rc" is expanded twice: do not pass an expression with side effects.
 */
#define DISP_RC(rc, msg) (void)(((rc) == 0) ? 0 : LOGERR(klogInt, (rc), msg))

/* DISP_RC2
 *  same as DISP_RC, but the message is prefixed with "name"
 *  ( logged as "<name>: <msg>" through PLOGERR ).
 */
#define DISP_RC2(rc, name, msg) (void)(((rc) == 0) ? 0 : \
    PLOGERR(klogInt, (klogInt, (rc), "$(name): $(msg)", "name=%s, msg=%s", name, msg)))

/* DISP_RC_Read
 *  report a failure related to reading column "name" at spot "spot":
 *  when "rc" is non-zero, logs "column <name>, spot <spot>: <msg>"
 *  through PLOGERR at klogInt level; does nothing when "rc" is zero.
 *
 *  The argument list has to supply one value per "key=%fmt" entry of the
 *  argument-format string, so "spot" is passed between "name" and "msg".
 *  It is cast to uint64_t, the type this file pairs with "%lu" elsewhere.
 *  NB - "rc" is expanded twice: do not pass an expression with side effects.
 */
#define DISP_RC_Read(rc, name, spot, msg) (void)(((rc) == 0) ? 0 : \
    PLOGERR(klogInt, (klogInt, (rc), "column $(name), spot $(spot): $(msg)", \
        "name=%s, spot=%lu, msg=%s", name, (uint64_t)(spot), msg)))

/* srastat_parms
 *  parameters of a single run: filled from the command line by KMain,
 *  completed by sra_stat and consulted when printing.
 */
typedef struct srastat_parms {
    /* path of the table to examine */
    const char* table_path;

    /* true - print XML; false - print text */
    bool xml;

    /* first and last spot ids to process;
       sra_stat replaces 0 with 1 and with the max spot id respectively */
    spotid_t start;
    spotid_t stop;

    /* set by sra_stat when a non-empty SPOT_GROUP value was read */
    bool hasSPOT_GROUP;

    /* set by sra_stat when read lengths differ from the first processed spot */
    bool variableReadLength;
} srastat_parms;
/* SraStats
 *  statistics accumulated by sra_stat for one SPOT_GROUP value;
 *  nodes are kept in a BSTree ordered by "spot_group".
 */
typedef struct SraStats {
    BSTNode n;
    char     spot_group[1024]; /* SPOT_GROUP value this node accumulates (tree key) */
    uint64_t spot_count; /*** number of spots ***/
    uint64_t spot_count_mates; /*** spots with more than one non-empty biological read ***/
    uint64_t bio_len; /** total length of non-empty biological reads **/
    uint64_t bio_len_mates; /** biological length of the spots counted in spot_count_mates **/
    uint64_t total_len; /** total length of all reads **/
    uint64_t bad_spot_count; /** spots having a biological read with RD_FILTER REJECT or CRITERIA **/
    uint64_t bad_bio_len;        /** length of biological reads with RD_FILTER REJECT or CRITERIA ***/
    uint64_t filtered_spot_count; /** spots having a biological read with RD_FILTER REDACTED **/ 
    uint64_t filtered_bio_len;   /** length of biological reads with RD_FILTER REDACTED **/
} SraStats;
/* SraSizeStats
 *  accumulator filled by get_size
 */
typedef struct SraSizeStats {
    /* sum of the sizes of all files visited, in bytes */
    uint64_t size;
} SraSizeStats;
/* visitor
 *  KDirectoryVisit callback used by get_size.
 *
 *  "data" [ IN, OUT ] - SraSizeStats accumulator.
 *
 *  aliases are skipped; the size of a file is added to the accumulator;
 *  a directory is visited in turn with this same callback;
 *  any other entry type is reported as an error.
 */
static
rc_t CC visitor(const KDirectory* dir, uint32_t type, const char* name, void* data)
{
    SraSizeStats* total = (SraSizeStats*) data;
    rc_t rc = 0;

    /* aliases contribute nothing */
    if ((type & kptAlias) != 0)
    {   return rc; }

    assert(total);

    if (type == kptFile) {
        uint64_t bytes = 0;
        rc = KDirectoryFileSize(dir, &bytes, name);
        DISP_RC2(rc, name, "while calling KDirectoryFileSize");
        if (rc == 0) {
            total->size += bytes;
            DBGMSG(DBG_APP, DBG_COND_1, ("File '%s', size %lu\n", name, bytes));
        }
    }
    else if (type == kptDir) {
        /* the outer visit is not recursive: descend explicitly */
        DBGMSG(DBG_APP, DBG_COND_1, ("Dir '%s'\n", name));
        rc = KDirectoryVisit(dir, false, visitor, total, name);
        DISP_RC2(rc, name, "while calling KDirectoryVisit");
    }
    else {
        rc = RC(rcExe, rcDirectory, rcVisiting, rcType, rcUnexpected);
        DISP_RC2(rc, name, "during KDirectoryVisit");
    }

    return rc;
}

/* get_size
 *  compute the total size of the files found under the directory of "tbl".
 *
 *  "tbl" [ IN ] - table to measure
 *
 *  "sizes" [ OUT ] - zeroed, then filled by "visitor"
 *
 *  returns the first error encountered, including errors
 *  returned while releasing the directory and the KTable.
 */
static
rc_t get_size(const SRATable* tbl, SraSizeStats* sizes)
{
    const KTable* ktbl = NULL;
    const KDirectory* root = NULL;
    rc_t rc;
    rc_t release_rc;

    assert(tbl && sizes);

    memset(sizes, 0, sizeof *sizes);

    rc = SRATableGetKTableRead(tbl, &ktbl);
    DISP_RC(rc, "while calling SRATableGetKTableRead");

    if (rc == 0) {
        rc = KTableOpenDirectoryRead(ktbl, &root);
        DISP_RC(rc, "while calling KTableOpenDirectoryRead");

        if (rc == 0) {
            rc = KDirectoryVisit(root, false, visitor, sizes, NULL);
            DISP_RC(rc, "while calling KDirectoryVisit");
        }
    }

    /* both releases are always attempted; an earlier error takes precedence */
    release_rc = KDirectoryRelease(root);
    root = NULL;
    if (rc == 0)
    {   rc = release_rc; }

    release_rc = KTableRelease(ktbl);
    ktbl = NULL;
    if (rc == 0)
    {   rc = release_rc; }

    return rc;
}

/* srastat_print
 *  BSTreeForEach callback: print the statistics of one SraStats node.
 *
 *  "n" [ IN ] - SraStats node
 *
 *  "data" [ IN ] - srastat_parms selecting the output format
 *
 *  text: one '|'-separated line per node.
 *  XML: a list of attributes, wrapped into a <Member> element
 *  only when spot groups are present.
 */
static
void CC srastat_print ( BSTNode *n, void *data )
{
    const SraStats *stats = ( const SraStats* ) n;
    const srastat_parms *parms = ( const srastat_parms* ) data;

    assert(parms && stats);

    if (!parms->xml) {
        OUTMSG(("%s|%s|%ld:%ld:%ld|%ld:%ld|%ld:%ld|%ld:%ld\n",
            parms->table_path, stats->spot_group,
            stats->spot_count, stats->total_len, stats->bio_len,
            stats->spot_count_mates, stats->bio_len_mates,
            stats->bad_spot_count, stats->bad_bio_len,
            stats->filtered_spot_count, stats->filtered_bio_len));
        return;
    }

    if (parms->hasSPOT_GROUP)
    {   OUTMSG(("  <Member member_name=\"%s\"", stats->spot_group)); }

    OUTMSG((" spot_count=\"%ld\" base_count=\"%ld\"",
        stats->spot_count, stats->total_len));
    OUTMSG((" base_count_bio=\"%ld\"", stats->bio_len));
    OUTMSG((" spot_count_mates=\"%ld\" base_count_bio_mates=\"%ld\"",
        stats->spot_count_mates, stats->bio_len_mates));
    OUTMSG((" spot_count_bad=\"%ld\" base_count_bio_bad=\"%ld\"",
        stats->bad_spot_count, stats->bad_bio_len));
    OUTMSG((" spot_count_filtered=\"%ld\" base_count_bio_filtered=\"%ld\"",
        stats->filtered_spot_count, stats->filtered_bio_len));

    if (parms->hasSPOT_GROUP)
    {   OUTMSG(("/>\n")); }
}

/* print_results
 *  print every node of "tr" in the format selected by "pb".
 *
 *  "tr" [ IN ] - tree of SraStats nodes
 *
 *  "pb" [ IN ] - output format, table path and the flags set by sra_stat
 *
 *  "sizes" [ IN, NULL OKAY ] - when given, a <Size> element is added
 *  to the XML output
 *
 *  In XML mode the nodes are enclosed in a <Run> element. Without spot
 *  groups the node attributes are written into the <Run> start tag itself,
 *  so the tag is closed only after the nodes were printed.
 */
static
void CC print_results(const BSTree* tr, srastat_parms* pb, const SraSizeStats* sizes)
{
    bool as_xml;
    bool grouped;

    assert(pb && tr);

    as_xml = pb->xml;
    grouped = pb->hasSPOT_GROUP;

    if (as_xml) {
        const char* read_length = "fixed";
        if (pb->variableReadLength)
        {   read_length = "variable"; }

        OUTMSG(("<Run accession=\"%s\" read_length=\"%s\"", pb->table_path, read_length));
        if (grouped)
        {   OUTMSG((">\n")); }
    }

    BSTreeForEach(tr, false, srastat_print, pb);

    if (!as_xml)
    {   return; }

    if (!grouped)
    {   OUTMSG((">")); }

    if (sizes != NULL) {
        if (!grouped)
        {   OUTMSG(("\n")); }
        OUTMSG(("  <Size value=\"%lu\" units=\"bytes\"/>\n", sizes->size));
    }

    OUTMSG(("</Run>\n"));
}

/* bst_whack_free
 *  BSTreeWhack callback: free one node; "ignore" is not used
 */
static
void CC bst_whack_free ( BSTNode *n, void *ignore )
{
    (void) ignore;
    free ( n );
}

/* srastats_cmp
 *  BSTreeFind callback: compare a spot group name with the name of a node.
 *
 *  "item" [ IN ] - NUL terminated spot group name
 *
 *  "n" [ IN ] - SraStats node
 *
 *  returns the result of strcmp ( item, node name )
 */
static
int CC srastats_cmp ( const void *item, const BSTNode *n )
{
    const SraStats *node = ( const SraStats* ) n;
    const char *key = ( const char* ) item;

    return strcmp ( key, node -> spot_group );
}

/* srastats_sort
 *  BSTreeInsert callback: order two SraStats nodes by spot group name
 */
static 
int CC srastats_sort ( const BSTNode *item, const BSTNode *n )
{
    const SraStats *left = ( const SraStats* ) item;
    const SraStats *right = ( const SraStats* ) n;

    return strcmp ( left -> spot_group, right -> spot_group );
}

/* sra_stat
 *  walk the spots [ pb->start .. pb->stop ] of "tbl", accumulate statistics
 *  per SPOT_GROUP value in a tree of SraStats nodes and print them
 *  with print_results.
 *
 *  "pb" [ IN, OUT ] - run parameters. "start" equal to 0 is replaced with 1;
 *  "stop" equal to 0 or beyond the max spot id is replaced with the max
 *  spot id. "hasSPOT_GROUP" and "variableReadLength" are filled in here.
 *
 *  "tbl" [ IN ] - table to read. READ_LEN and READ_TYPE are required;
 *  SPOT_GROUP and RD_FILTER are used when they can be opened.
 *
 *  "sizes" [ IN, NULL OKAY ] - passed through to print_results
 *
 *  returns 0 on success. Nothing is printed unless all the requested
 *  spots were processed without error.
 */
static
rc_t sra_stat(srastat_parms* pb, const SRATable* tbl, const SraSizeStats* sizes)
{
    rc_t rc = 0;

    const char READ_LEN  [] = "READ_LEN";
    const char READ_TYPE [] = "READ_TYPE";
    const char RD_FILTER [] = "RD_FILTER";
    const char SPOT_GROUP[] = "SPOT_GROUP";

    const SRAColumn* cREAD_LEN = NULL;
    const SRAColumn* cREAD_TYPE = NULL;
    const SRAColumn* cSPOT_GROUP = NULL;
    const SRAColumn* cRD_FILTER = NULL;

    /* required columns */
    {
        const char* name = READ_LEN;
        rc = SRATableOpenColumnRead(tbl, &cREAD_LEN, name, vdb_uint32_t);
        DISP_RC2(rc, name, "while calling SRATableOpenColumnRead");
    }
    if (rc == 0) {
        const char* name = READ_TYPE;
        rc = SRATableOpenColumnRead(tbl, &cREAD_TYPE, name, sra_read_type_t);
        DISP_RC2(rc, name, "while calling SRATableOpenColumnRead");
    }

    /* optional columns: "not found" is not an error;
       the per-spot code tests the column pointer to know whether to read it */
    if (rc == 0) {
        const char* name = SPOT_GROUP;
        rc = SRATableOpenColumnRead(tbl, &cSPOT_GROUP, name, vdb_ascii_t);
        if (GetRCState(rc) == rcNotFound)
        {   rc = 0; }
        DISP_RC2(rc, name, "while calling SRATableOpenColumnRead");
    }
    if (rc == 0) {
        const char* name = RD_FILTER;
        rc = SRATableOpenColumnRead(tbl, &cRD_FILTER, name, sra_read_filter_t);
        if (GetRCState(rc) == rcNotFound)
        {   rc = 0; }
        DISP_RC2(rc, name, "while calling SRATableOpenColumnRead");
    }

    if (rc == 0) {
        spotid_t spotid;
        pb->hasSPOT_GROUP = 0;
        rc = SRATableMaxSpotId(tbl, &spotid);
        DISP_RC(rc, "failed to read max spot id");
        if (rc == 0) {
            bool fixedReadLength = true;
            int g_nreads = 0;            /* number of reads of the first processed spot */
            uint32_t g_dREAD_LEN[255];   /* read lengths of the first processed spot */

            BSTree tr;                   /* SraStats nodes keyed by spot group */

            memset(g_dREAD_LEN, 0, sizeof g_dREAD_LEN);
            BSTreeInit(&tr);

            /* here "spotid" still holds the max spot id */
            if (pb->start == 0)
            {   pb->start = 1; }
            if (pb->stop == 0 || pb -> stop > spotid)
            {   pb->stop = spotid; }

            for (spotid = pb->start; spotid <= pb->stop && (rc == 0); ++spotid) {
                SraStats* ss;
                uint32_t dREAD_LEN[255];
                uint8_t dREAD_TYPE[255];
                uint8_t dRD_FILTER[255];
                /* key used when the table has no SPOT_GROUP column */
                char   dSPOT_GROUP[1024] = "NULL";

                const void* base;
                bitsz_t boff, row_bits;
                int nreads;
                int nfilters = 0; /* number of RD_FILTER values copied for this spot */

                rc = Quitting();
                if (rc)
                {   LOGMSG(klogWarn, "Interrupted"); }

                /* READ_LEN: one uint32_t per read */
                if (rc == 0) {
                    rc = SRAColumnRead(cREAD_LEN, spotid, &base, &boff, &row_bits);
                    DISP_RC_Read(rc, READ_LEN, spotid, "while calling SRAColumnRead");
                }
                if (rc == 0) {
                    if (boff & 7)
                    {   rc = RC(rcExe, rcColumn, rcReading, rcOffset, rcInvalid); }
                    if ((row_bits >> 3) > sizeof(dREAD_LEN))
                    {   rc = RC(rcExe, rcColumn, rcReading, rcBuffer, rcInsufficient); }
                    DISP_RC_Read(rc, READ_LEN, spotid, "after calling SRAColumnRead");
                }
                if (rc == 0) {
                    int i, bio_len, bio_count, bad_cnt, filt_cnt;
                    memcpy(dREAD_LEN, ((const char*)base) + (boff>>3), row_bits>>3);
                    nreads = (row_bits >> 3) / sizeof(*dREAD_LEN);

                    /* every spot has to have as many reads as the first one */
                    if (spotid == pb->start) {
                        g_nreads = nreads;
                    }
                    else if (g_nreads != nreads) {
                        rc = RC(rcExe, rcTable, rcReading, rcData, rcInconsistent);
                        PLOGERR(klogInt, (klogInt, rc,
                            "spot=$(spot), ReadNumber=$(n), previous=$(prev)", "spot=%lu, n=%d, prev=%d", spotid, nreads, g_nreads));
                    }

                    /* READ_TYPE: one byte per read */
                    if (rc == 0) {
                        rc = SRAColumnRead(cREAD_TYPE, spotid, &base, &boff, &row_bits);
                        DISP_RC_Read(rc, READ_TYPE, spotid, "while calling SRAColumnRead");
                        if (rc == 0) {
                            if (boff & 7)
                            {   rc = RC(rcExe, rcColumn, rcReading, rcOffset, rcInvalid); }
                            if ((row_bits >> 3) > sizeof(dREAD_TYPE))
                            {   rc = RC(rcExe, rcColumn, rcReading, rcBuffer, rcInsufficient); }
                            if ((row_bits >> 3) !=  nreads)
                            {   rc = RC(rcExe, rcColumn, rcReading, rcData, rcIncorrect); }
                            DISP_RC_Read(rc, READ_TYPE, spotid, "after calling SRAColumnRead");
                        }
                    }

                    /* SPOT_GROUP ( optional ): key of the statistics node */
                    if (rc == 0) {
                        memcpy(dREAD_TYPE, ((const char*)base) + (boff >> 3), row_bits >> 3);
                        if (cSPOT_GROUP) {
                            rc = SRAColumnRead(cSPOT_GROUP, spotid, &base, &boff, &row_bits);
                            DISP_RC_Read(rc, SPOT_GROUP, spotid, "while calling SRAColumnRead");
                            if (rc == 0) {
                                if (row_bits > 0) {
                                    pb -> hasSPOT_GROUP = 1;
                                    if (boff & 7)
                                    {   rc = RC(rcExe, rcColumn, rcReading, rcOffset, rcInvalid); }
                                    /* ">=": one byte has to remain for the NUL appended below */
                                    if ((row_bits >> 3) >= sizeof(dSPOT_GROUP))
                                    {   rc = RC(rcExe, rcColumn, rcReading, rcBuffer, rcInsufficient); }
                                    DISP_RC_Read(rc, SPOT_GROUP, spotid, "after calling SRAColumnRead");
                                    if (rc == 0) {
                                        memcpy(dSPOT_GROUP,((const char*)base) + (boff>>3),row_bits>>3);
                                        dSPOT_GROUP[row_bits>>3]='\0';
                                    }
                                }
                                else {  dSPOT_GROUP[0]='\0'; }
                            } else { break; }
                        }
                    }

                    if (rc == 0) {
                        /* RD_FILTER ( optional ) */
                        if (cRD_FILTER) {
                            rc = SRAColumnRead(cRD_FILTER, spotid, &base, &boff, &row_bits);
                            DISP_RC_Read(rc, RD_FILTER, spotid, "while calling SRAColumnRead");
                            if (rc == 0) {
                                if (boff & 7)
                                {   rc = RC(rcExe, rcColumn, rcReading, rcOffset, rcInvalid); }
                                if ((row_bits >> 3) > sizeof(dRD_FILTER))
                                {   rc = RC(rcExe, rcColumn, rcReading, rcBuffer, rcInsufficient); }
                                DISP_RC_Read(rc, RD_FILTER, spotid, "after calling SRAColumnRead");
                                if (rc == 0) {
                                    memcpy(dRD_FILTER,((const char*)base) + (boff>>3),row_bits>>3);
                                    nfilters = (int)(row_bits >> 3);
                                }
                            } else { break; }
                        }

                        /* find or create the node of this spot group */
                        ss = (SraStats*)BSTreeFind(&tr, dSPOT_GROUP, srastats_cmp);
                        if (ss == NULL) {
                            ss = calloc(1, sizeof(*ss));
                            if (ss == NULL) {
                                rc = RC(rcExe, rcStorage, rcAllocating, rcMemory, rcExhausted);
                                break;
                            }
                            else {
                                strcpy(ss->spot_group, dSPOT_GROUP);
                                BSTreeInsert(&tr, (BSTNode*)ss, srastats_sort);
                            }
                        }
                        ss->spot_count++;

                        for (bio_len = bio_count = i = bad_cnt = filt_cnt = 0; (i < nreads) && (rc == 0); i++) {
                            /* read lengths are "fixed" while they match the first processed spot */
                            if (spotid == pb->start) {
                                g_dREAD_LEN[i] = dREAD_LEN[i];
                            }
                            else if (g_dREAD_LEN[i] != dREAD_LEN[i]) { fixedReadLength = false; }

                            if (dREAD_LEN[i] > 0) {
                                ss->total_len += dREAD_LEN[i];
                                if ((dREAD_TYPE[i] & SRA_READ_TYPE_BIOLOGICAL) != 0) {
                                    bio_len += dREAD_LEN[i];
                                    bio_count++;
                                    /* only look at filter values that were actually read:
                                       a row shorter than the number of reads leaves the
                                       rest of dRD_FILTER unset, such reads are not counted
                                       as bad or filtered
                                       NOTE(review): confirm this is the wanted treatment
                                       of short RD_FILTER rows */
                                    if (cRD_FILTER && i < nfilters) {
                                        switch (dRD_FILTER[i]) {
                                            case SRA_READ_FILTER_PASS:
                                                break;
                                            case SRA_READ_FILTER_REJECT:
                                            case SRA_READ_FILTER_CRITERIA:
                                                ss->bad_bio_len += dREAD_LEN[i]; 
                                                bad_cnt++;
                                                break;
                                            case SRA_READ_FILTER_REDACTED:
                                                ss->filtered_bio_len += dREAD_LEN[i];
                                                filt_cnt++;
                                                break;
                                            default:
                                                rc = RC(rcExe, rcColumn, rcReading, rcData, rcUnexpected);
                                                PLOGERR(klogInt, (klogInt, rc,
                                                    "spot=$(spot), read=$(read), READ_FILTER=$(val)", "spot=%lu, read=%d, val=%d",
                                                    spotid, i, dRD_FILTER[i]));
                                                break;
                                        }
                                    }
                                }
                            }
                        }
                        ss->bio_len += bio_len;
                        if (bio_count > 1) {
                            ss->spot_count_mates++;
                            ss->bio_len_mates += bio_len;
                        }
                        if (bad_cnt)
                        {   ss->bad_spot_count++; }
                        if (filt_cnt)
                        {   ss->filtered_spot_count++; }
                    }
                }
            }
    /******* Output Results here ************/

            if (rc == 0) {
                pb->variableReadLength = !fixedReadLength;
                print_results(&tr, pb, sizes);
            }

            BSTreeWhack(&tr, bst_whack_free, NULL);
        }
    }

    /* release on every path, so that a failure half way through opening
       does not leave earlier columns open; a column that was not opened
       is still NULL here, which is how the optional columns
       have always been passed to SRAColumnRelease */
    SRAColumnRelease(cRD_FILTER);
    SRAColumnRelease(cSPOT_GROUP);
    SRAColumnRelease(cREAD_TYPE);
    SRAColumnRelease(cREAD_LEN);

    return rc;
}

/* run
 *  open the table named by "pb", measure its size and print its statistics.
 *
 *  "pb" [ IN, OUT ] - run parameters; "table_path" has to be set
 *
 *  returns the first error encountered, including errors
 *  returned while releasing the table and the manager.
 */
static
rc_t run(srastat_parms* pb)
{
    const SRAMgr* mgr = NULL;
    const SRATable* tbl = NULL;
    SraSizeStats sizes;
    rc_t rc;
    rc_t release_rc;

    assert(pb && pb->table_path);

    rc = SRAMgrMakeRead(&mgr);
    if (rc == 0) {
        rc = SRAMgrOpenTableRead(mgr, &tbl, pb->table_path);
        if (rc == 0) {
            rc = get_size(tbl, &sizes);
            if (rc == 0)
            {   rc = sra_stat(pb, tbl, &sizes); }
        }
        else {
            PLOGERR(klogInt, (klogInt, rc, "failed to open SRATable '$(spec)'", "spec=%s", pb->table_path));
        }

        /* attempted whether or not the table was opened */
        release_rc = SRATableRelease(tbl);
        if (rc == 0)
        {   rc = release_rc; }
    }
    else {
        LOGERR(klogInt, rc, "failed to open SRAMgr");
    }

    release_rc = SRAMgrRelease(mgr);
    if (rc == 0)
    {   rc = release_rc; }

    return rc;
}

/* Version  EXTERN
 *  return 4-part version code: 0xMMmmrrrr, where
 *      MM = major release
 *      mm = minor release
 *    rrrr = bug-fix release
 *
 *  the value comes from SRA_STAT_VERS
 */
ver_t CC KAppVersion ( void )
{
    const ver_t vers = SRA_STAT_VERS;
    return vers;
}


/* Usage
 */
/* long option names, looked up by KMain with ArgsOptionCount / ArgsOptionValue */
#define OPTION_XML   "xml"
#define OPTION_START "start"
#define OPTION_STOP  "stop"

/* single character aliases of the options above */
#define ALIAS_XML   "x"
#define ALIAS_START "b"
#define ALIAS_STOP  "e"

/* help texts, NULL terminated lists of lines; also printed by Usage */
static const char * xml_usage[] = { "output as XML (default is text)", NULL };
static const char * start_usage[] = { "starting spot id ( default 1 )", NULL };
static const char * stop_usage[] = { "ending spot id ( default max )", NULL };

/* option table handed to ArgsMakeAndHandle by KMain;
   "xml" is a flag, "start" and "stop" each take a value */
static
OptDef Options[] = 
{
    { OPTION_XML,   ALIAS_XML,   NULL, xml_usage,   0, false, false },
    { OPTION_START, ALIAS_START, NULL, start_usage, 1, true,  false },
    { OPTION_STOP,  ALIAS_STOP,  NULL, stop_usage,  1, true,  false }
};

/* summary
 *  print the "Usage" and "Summary" paragraphs of the help screen
 *
 *  "progname" [ IN ] - name to show in the usage line
 */
static
void summary (const char * progname)
{
    OUTMSG (("\nUsage:\n"
             "  %s [options] table\n\n"
             "Summary:\n"
             "  Display table statistics\n\n",
             progname));
}

/* name shown by Usage when the program name cannot be obtained from the arguments */
static
const char def_name[] = "sra-stat";

/* Usage
 *  print the full help screen: summary, tool options,
 *  standard options and version.
 *
 *  "args" [ IN, NULL OKAY ] - source of the program name;
 *  def_name is shown when it is NULL or ArgsProgram fails
 *
 *  returns 0, or the error that prevented the program name from being
 *  obtained ( the help screen is printed in both cases )
 */
rc_t CC Usage (const Args * args)
{
    const char * progname = def_name;
    const char * fullpath = def_name;
    rc_t rc;

    if (args != NULL)
        rc = ArgsProgram (args, &fullpath, &progname);
    else
        rc = RC (rcApp, rcArgv, rcAccessing, rcSelf, rcNull);

    if (rc != 0)
    {
        /* do not rely on what a failed call may have stored */
        progname = def_name;
        fullpath = def_name;
    }

    summary (progname);

    OUTMSG (("Options:\n"));

    HelpOptionLine (ALIAS_XML, OPTION_XML, NULL, xml_usage);
    HelpOptionLine (ALIAS_START, OPTION_START, "row-id", start_usage);
    HelpOptionLine (ALIAS_STOP, OPTION_STOP, "row-id", stop_usage);
    HelpOptionsStandard ();
    HelpVersion (fullpath, KAppVersion());

    return rc;
}


/* MiniUsage
 *  print the short usage message shown when no table was given.
 *
 *  "args" [ IN ] - source of the program name ( argv [ 0 ] )
 *
 *  The name defaults to this tool's own name, "sra-stat", and that default
 *  is restored when argv [ 0 ] cannot be obtained.
 *
 *  always returns 0
 */
rc_t MiniUsage (const Args * args)
{
    static const char fallback[] = "sra-stat";
    const char * progname = fallback;

    /* do not print whatever a failed call may have left in "progname" */
    if (ArgsArgvValue (args, 0, &progname) != 0 || progname == NULL)
        progname = fallback;

    KOutMsg ( "\n"
              "Usage: %s [options] table\n"
              "    run with option '--help' for help\n",
              progname);
    return 0;
}

/* Version
 *  print "<program name>: <version>"
 *
 *  "args" [ IN ] - source of the program name ( argv [ 0 ] )
 *
 *  always returns 0
 */
rc_t CC Version ( const Args * args )
{
    const char * progname = "sra-stat";

    /* the result is deliberately not examined */
    (void) ArgsArgvValue(args, 0, &progname);

    KOutMsg ( "%s: %V\n", progname, KAppVersion());

    return 0;
}


/* KMain - EXTERN
 *  executable entrypoint "main" is implemented by
 *  an OS-specific wrapper that takes care of establishing
 *  signal handlers, logging, etc.
 *
 *  in turn, OS-specific "main" will invoke "KMain" as
 *  platform independent main entrypoint.
 *
 *  "argc" [ IN ] - the number of textual parameters in "argv"
 *  should never be < 0, but has been left as a signed int
 *  for reasons of tradition.
 *
 *  "argv" [ IN ] - array of NUL terminated strings expected
 *  to be in the shell-native character set: ASCII or UTF-8
 *  element 0 is expected to be executable identity or path.
 *
 *  parses the "start", "stop" and "xml" options and the table path,
 *  then calls "run". Without a table path only the short usage message
 *  is printed. The parsed arguments are released on every path.
 */
rc_t CC KMain ( int argc, char *argv [] )
{
    Args* args = NULL;
    rc_t rc = 0;

    /* set once a table path was obtained: "run" is called only then */
    bool have_table = false;

    srastat_parms pb;
    memset(&pb, 0, sizeof pb);

    rc = ArgsMakeAndHandle(&args, argc, argv, 1, Options, sizeof Options / sizeof (OptDef));
    if (rc == 0)
    {
        do
        {
            uint32_t pcount;
            const char* pc;

            rc = ArgsOptionCount (args, OPTION_START, &pcount);
            if (rc)
                break;

            if (pcount == 1)
            {
                rc = ArgsOptionValue (args, OPTION_START, 0, &pc);
                if (rc)
                    break;

                pb.start = AsciiToU32 (pc, NULL, NULL);
            }

            rc = ArgsOptionCount (args, OPTION_STOP, &pcount);
            if (rc)
                break;

            if (pcount == 1)
            {
                rc = ArgsOptionValue (args, OPTION_STOP, 0, &pc);
                if (rc)
                    break;

                pb.stop = AsciiToU32 (pc, NULL, NULL);
            }

            rc = ArgsOptionCount (args, OPTION_XML, &pcount);
            if (rc)
                break;

            if (pcount)
                pb.xml = true;

            rc = ArgsParamCount (args, &pcount);
            if (rc)
                break;

            if (pcount == 0)
            {
                /* nothing to examine: show the short usage and leave through
                   the common exit below, so that "args" gets released */
                rc = MiniUsage (args);
                break;
            }

            rc = ArgsParamValue (args, 0, &pb.table_path);
            if (rc)
                break;

            have_table = true;
        } while (0);
    }

    if (rc == 0 && have_table)
    {   rc = run(&pb); }

    {
        rc_t rc2 = ArgsWhack(args);
        if (rc == 0)
        {   rc = rc2; }
    }

    return rc;
}
