/*-----------------------------------------------------------------------------

   QUASAR - q-gram Alignment based on Suffix ARrays

   Copyright (C) 1998 Stefan Burkhardt
   Author: Stefan Burkhardt <stburk@mpi-sb.mpg.de>
   This file is part of the QUASAR package.

   QUASAR is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   QUASAR is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with the QUASAR package; see the file copying.  If not,
   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.  or contact the author. 

-------------------------------------------------------------------------------

  index structure module
  
  $File$
  $Revision: 1.4 $
  $Date: Wed, 29 Mar 2000 11:07:45 +0200 $

-----------------------------------------------------------------------------*/

#include "q_index.h"

Q_Index *NewIndex()
/*  Creates and initializes an index */
{
  Q_Index	*index;

  index = (Q_Index *)MyMalloc(sizeof(Q_Index), "index structure");
  
  index->sa = NULL;
  index->q = 0;
  index->n = 0;
  index->sr = NULL;

  return index;
}

void DeleteIndex(Q_Index *index)
/*  Frees the two arrays owned by index (sa and sr) and then the index
    structure itself. Passing NULL is allowed and does nothing, mirroring
    free(NULL). After the call the pointer must not be used any more. */
{
  if(index == NULL)
    return;

  /* free(NULL) is a no-op, so the members need no individual NULL checks */
  free(index->sa);
  free(index->sr);
  free(index);
}

static void print_qgram(LONG qgram, int q)
/*  Prints the q-gram encoded in qgram to stdout as q characters over the
    alphabet "ACGT", followed by a newline. Every character is stored in a
    2-bit field of qgram; the field at bit position 2*(q-1) is printed
    first and the field in the two lowest bits is printed last. */
{
  static const char alphabet[] = "ACGT";
  int               pos;

  for(pos = q-1; pos >= 0; pos--) {
    /* Mask with 3 instead of taking "% 4": if LONG is a signed type and the
       top bit of qgram is set, the shifted value may be negative and the
       remainder would index in front of alphabet[]. */
    putchar(alphabet[(qgram >> (2*pos)) & 3]);
  }
  putchar('\n');
}

static char *WriteQgramToString(int q, LONG qgram)
/*  Decodes the q-gram encoded in qgram into a newly allocated,
    NUL-terminated string of q characters over the alphabet "ACGT".
    The two lowest bits of qgram become the LAST character of the string,
    the next two bits the one before it, and so on.
    The buffer is obtained from MyMalloc; the caller owns it (BinSearch in
    this file releases it with free()). */
{
  static const char  alphabet[] = "ACGT";
  char              *string;
  int                i;

  string = MyMalloc(sizeof(char) * (q+1), "tuple string");

  for(i = 0; i < q; i++) {
    /* Mask with 3 instead of taking "% 4": if LONG is a signed type and the
       top bit of qgram is set, the shifted value may be negative and the
       remainder would index in front of alphabet[]. */
    string[q-i-1] = alphabet[(qgram >> (2*i)) & 3];
  }
  string[q] = '\0';
  return string;
}


Q_Index *ReadIndex(char *infile, int q)
/*  Builds a Q_Index for q-grams of length q from two files:
      <infile>.sr<q>   the search results: 4^q + 1 ints, read into index->sr
      <infile>.csa<q>  the compressed suffix array, read into index->sa;
                       its length in ints is taken from the last entry of
                       the search results, index->sr[4^q]
    Both files are read as raw bytes straight into int arrays, so they are
    opened in binary mode.
    Prints progress messages and the index information to stdout and
    returns the new index; release it with DeleteIndex.
    NOTE(review): 1<<(2*q) is evaluated in int and the entry count read
    from the file is not validated; q is assumed to be small enough and
    the file well-formed - confirm against the callers. */
{
  FILE		*index_file;
  Q_Index	*index;

  char		*sr_file;
  size_t	 name_size;
  LONG		 length = 0;

  index = NewIndex();
  index->q = q;
  index->n = 1<<(2*q);
  length = sizeof(int) * (1 + index->n);
  index->sr = MyMalloc(length, "search results");

  /* room for the longest suffix (".csa" + decimal int) and the terminator;
     the same buffer is reused for both file names */
  name_size = strlen(infile) + 20;
  sr_file = MyMalloc(sizeof(char) * name_size, "index filename");

  snprintf(sr_file, name_size, "%s.sr%d", infile, q);
  printf("Reading search results from file %s (%lld bytes)\n", sr_file, (long long)length);
  index_file = MyFopen(sr_file, "rb");
  MyFread(index->sr, length, index_file);
  fclose(index_file);

  /* the final search result is the total number of suffix array entries */
  length = (LONG)sizeof(int) * (LONG)index->sr[index->n];
  index->sa = MyMalloc(length, "suffix array");

  snprintf(sr_file, name_size, "%s.csa%d", infile, q);
  printf("Reading compressed suffix array from: %s (%lld bytes)\n", sr_file, (long long)length);
  index_file = MyFopen(sr_file, "rb");
  MyFread(index->sa, length, index_file);
  fclose(index_file);

  free(sr_file);

  PrintIndexInfo(index);

  return index;
}

int *GetHits(Q_Index *index, LONG qgram, int *hits)
/*  Looks up the hitlist of qgram. The list occupies the suffix array
    entries index->sa[sr[qgram]] .. index->sa[sr[qgram+1] - 1]; its length
    sr[qgram+1] - sr[qgram] is written to *hits and a pointer to its first
    entry is returned. The pointer refers into index->sa and must not be
    freed by the caller. qgram itself is not range-checked. */
{
  int	first = index->sr[qgram];
  int	past_last = index->sr[qgram+1];

  *hits = past_last - first;
#ifdef INDEX_DEBUG
  printf("qgram %llx has %d hits\n", (unsigned long long)qgram, hits[0]);
  printf("Position: %d - %d \n", first, past_last);
  /* a pointer has to be printed with %p; passing it to %d is undefined */
  printf("Address: %p \n", (void *)&index->sa[first]);
#endif

#ifdef DEBUG
  if(first < 0 || first > index->sr[1<<(2*index->q)])
    printf("Out of Range!!!\n");
#endif

  return &index->sa[first];
}

int BinSearch(Q_SSet *sset, LONG qgram, int q, Q_Index *index, int *left, int *len)
/*  Narrows a hitlist down to the entries whose text starts with qgram.

    left points to *len offsets into sset->data. The list is assumed to be
    ordered so that strncmp(sset->data+left[i], <qgram as string>, q) never
    decreases with i (presumably a suffix array range - confirm against the
    caller). Two binary searches locate the first and the last entry that
    compare equal to the q-gram on its first q characters.

    Returns the position inside left[] of the first matching entry and
    stores the number of matching entries in *len (0 if there is none).
    index is only used for DEBUG output. */
{
  int	lo = 0;
  int	hi = *len-1;
  int	mid;
  int	final_left = 0;

  char *seq;

  seq = WriteQgramToString(q, qgram);

  /* lower bound: first entry that is not smaller than seq */
  while(lo<=hi) {
    mid = lo + ((hi-lo)>>1);	/* same midpoint as (lo+hi)>>1, cannot overflow */
#ifdef DEBUG
    printf("l: %d, r: %d, m: %d, index->sa: %p\n", lo, hi, mid, (void *)index->sa);
    printf("offset: %d\n", left[mid]);
    getchar();
#endif
    if(strncmp(sset->data+left[mid], seq, q) >= 0)
      hi = mid-1;
    else
      lo = mid+1;
  }
  final_left = lo;

  /* upper bound: last entry that is not larger than seq. The search must
     stop at the last valid position *len-1; starting at *len would read
     left[*len], one element past the hitlist. */
  hi = *len-1;
  while(lo<=hi) {
    mid = lo + ((hi-lo)>>1);
    if(strncmp(sset->data+left[mid], seq, q) <= 0)
      lo = mid+1;
    else
      hi = mid-1;
  }
  *len = 1+hi-final_left;
  free(seq);
  return final_left;
}

int GetLongHits(Q_SSet *sset, char *seq, Q_Index *index, int q, LONG qgram, int *hits, int *n)
/*  Finds the hits of a q-gram that is longer than the q-grams the index
    was built for (q > index->q).
    On entry hits points to the hitlist of the shorter, indexed q-gram and
    *n is the length of that list. BinSearch then narrows the list: the
    return value is the position of the first real hit inside hits[] and
    *n is updated to the number of real hits.
    For q == index->q and q < index->q only a message is printed, *n is
    left untouched and 0 is returned. seq is not used. */
{
  /* the only supported case: q-gram longer than the indexed ones */
  if(q > index->q) {

#ifdef DEBUG
    printf("qgram %lld hits start at %d (%d hits)\n", qgram, *hits, *n);
    getchar();
#endif

    return BinSearch(sset, qgram, q, index, hits, n);
  }

  /* remaining cases are q == index->q and q < index->q */
  if(q == index->q)
    printf("ERROR!!! q == index->q\n");
  else
    printf("CHANGE!!! SHOULD ALSO BE ABLE TO HANDLE SHORTER Q-GRAMS\n");

  return 0;
}


void PrintIndexInfo(Q_Index *index)
/*  Writes a short summary of index to stdout: a header line, the tuple
    length index->q and the number of distinct tuples, 4^q, computed as
    1<<(2*q). */
{
  int	tuple_count;

  printf("\nPRINTING INDEX INFORMATION:\n");
  printf("Tuple length: %d\n", index->q);

  /* two bits per character give 4^q possible tuples */
  tuple_count = 1<<(2*index->q);
  printf("Number of tuples: %d\n", tuple_count);
}

void PrintIndex(Q_Index *index)
/*  Prints the complete index to stdout: the summary of PrintIndexInfo,
    then all 4^q + 1 search results (index->sr) and finally every suffix
    array entry (index->sa), each as "position: value". */
{
  int		i;
  int		tuples;
  int		sa_entries;

  PrintIndexInfo(index);
  printf("Search results: \n");

  /* sr holds one entry per tuple plus a final end marker, hence "<=" */
  tuples = 1<<(2*index->q);
  for(i=0; i<=tuples; i++)
    printf("%d: %d\n", i, index->sr[i]);

  /* The end marker sr[tuples] is the NUMBER of suffix array entries (this
     is how ReadIndex sizes sa), so the valid positions are
     0 .. sa_entries-1; "<=" would read one element past the array. */
  sa_entries = index->sr[tuples];
  for(i=0; i<sa_entries; i++)
    printf("%d: %d\n", i, index->sa[i]);
}

void PrintIndexStatistics(Q_Index *index)
/*  Intended to report statistics about index. At present only the q-gram
    length index->q is written to stdout. */
{
  printf("q-gram length: %d\n", index->q);

  /* TODO: the actual statistics are not implemented yet */
}


