/*-----------------------------------------------------------------------------

   QUASAR - q-gram Alignment based on Suffix ARrays

   Copyright (C) 1998 Stefan Burkhardt
   Author: Stefan Burkhardt <stburk@mpi-sb.mpg.de>
   This file is part of the QUASAR package.

   QUASAR is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   QUASAR is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with the QUASAR package; see the file copying.  If not,
   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.  or contact the author. 

-------------------------------------------------------------------------------

  sequence set module
  
  $File$
  $Revision: 1.6 $
  $Date: Wed, 16 Feb 2000 16:16:22 +0100 $

-----------------------------------------------------------------------------*/

#include "q_sset.h"

Q_SSet *NewSSet()
/*  Allocate a sequence set through MyMalloc and hand it back in its
    empty state: size 0, no offsets array and no sequence data attached. */
{
  Q_SSet	*fresh = (Q_SSet *)MyMalloc(sizeof(Q_SSet), "sequence set");

  /* start out empty; the arrays are attached later by whoever fills the set */
  fresh->data = NULL;
  fresh->offsets = NULL;
  fresh->size = 0;

  return fresh;
}

void DeleteSSet(Q_SSet *sset)
/*  Frees the memory held by a sequence set: its offsets array, its
    data array and the Q_SSet structure itself.
    sset may be NULL, in which case nothing happens (same convention
    as free()). */
{
  if(sset == NULL)
    return;

  /* free(NULL) is defined to do nothing, so the members need no guards */
  free(sset->offsets);
  free(sset->data);
  free(sset);
}


Q_SSet *ReadSSet(char *infile)
/*  Read a sequence set from the file named infile.sset and return it.

    File layout as consumed here:
      int            number of sequences (size)
      int[size+1]    offsets; offsets[size] is the total data length
      char[...]      offsets[size] bytes of sequence data

    The returned set is created with NewSSet(); release it with
    DeleteSSet().  No result of MyFopen/MyFread/MyMalloc is checked in
    this function, so error handling is whatever those helpers do.
    NOTE(review): size and offsets[size] come straight from the file and
    are not validated before being used as allocation sizes. */
{
  FILE		*sset_file;
  Q_SSet	*sset;
  char		*filename;

#ifdef SSET_DEBUG
  char		diff_command[1000];
#endif
  
  /* +6 = five characters of ".sset" plus the terminating '\0' */
  filename = MyMalloc(strlen(infile)+6, "sset filename");
  sprintf(filename, "%s.sset", infile);
  printf("Reading sequence set from file: %s\n", filename);
  sset_file = MyFopen(filename, "r");

  sset = NewSSet();

  MyFread(&sset->size, sizeof(int), sset_file);
  
  /* one more offset than sequences: the last entry marks the end of data */
  sset->offsets = MyMalloc(sizeof(int) * (sset->size+1), "sequence offsets");
  MyFread(sset->offsets, sizeof(int) * (sset->size+1), sset_file);
  
  sset->data = MyMalloc(sset->offsets[sset->size], "sequence data");
  MyFread(sset->data, sset->offsets[sset->size], sset_file);

  fclose(sset_file);
  
#ifdef SSET_DEBUG
  /* Debug round trip: dump the set again and diff it against the input.
     The previous version of this block referred to variables (db, argv)
     that do not exist in this function, passed no argument for %s and
     left the diff command unterminated.
     NOTE(review): the old code diffed against argv[3] with its last five
     characters removed; infile (the name without ".sset") is presumably
     the same path - confirm. */
  PrintSSetInfo(sset);
  printf("Diffing %s with original file\n", filename);
  getchar();
  WriteSSet(sset, "tmp_sset_file");
  snprintf(diff_command, sizeof(diff_command), "diff tmp_sset_file %s", infile);
  system(diff_command);
  getchar();
#endif

  free(filename);

  return sset;
}
char *GetReverseSSetSequence(Q_SSet *sset, int seq)
/*  Return the reverse complement of sequence number seq in sset as a
    '\0'-terminated string.  The buffer is allocated in this function
    with MyMalloc; the caller owns it.

    Complement mapping: A->T, C->G, G->C, T->A.  Any other byte is
    written as 'T'.  This is what the former lookup table produced for
    every byte value it covered; the table was indexed with the raw
    character and was only 85 entries long, so larger (or negative)
    byte values read outside of it.  The switch below has no such limit.

    NOTE(review): seq is not range-checked here. */
{
  int	i;
  int	bases;
  char	*rquery;

  bases = Bases(sset, seq);

  rquery = MyMalloc(sizeof(char) * (1+bases), "reverse query");
  for(i=0; i<bases; i++) {
    /* walk backwards from two bytes before the start of sequence seq+1;
       presumably the byte directly before that start is the separator */
    switch (*(Sequence(sset,seq+1)-2-i)) {
    case 'C':
      rquery[i] = 'G';
      break;
    case 'G':
      rquery[i] = 'C';
      break;
    case 'T':
      rquery[i] = 'A';
      break;
    case 'A':
    default:
      rquery[i] = 'T';
      break;
    }
  }
  rquery[bases] = '\0';
  return rquery;
}

LONG *GetSSetQgrams(Q_SSet *sset, int seq, Q_Options *op)
/*  Compute the q-gram numbers of sequence number seq in sset, with
    q = op->q.

    Encoding: A=0, C=1, G=2, T=3, two bits per base, the first base of
    a q-gram in the most significant position.

    Returns an array allocated with MyMalloc (caller owns it):
      qgrams[0]             number of q-grams (bases - q + 1)
      qgrams[1..qgrams[0]]  q-gram numbers in sequence order
    Returns NULL if seq is out of range.  If the sequence is shorter
    than q the array is still returned, with qgrams[0] <= 0 and no
    further entries filled in.  Both error paths print a message and
    wait for a key press.

    A character other than A/C/G/T is reported and contributes 0 to the
    q-gram, i.e. it is treated like 'A'. */
{
  LONG		*qgrams;
  LONG		qgram = 0;
  int		bases;
  int		i;
  int		tuple;
  char		*cptr;
  LONG		mask = 1;
  
  if(seq<0 || seq>=sset->size) {
    printf("ERROR: seqeuence number out of range\n");
    getchar();
    return NULL;
  }

  /* mask with the low 2*q bits set: exactly one q-gram */
  mask = (mask<<(2*op->q))-1;

  /*  number of bases in this sequence */
  bases = Bases(sset, seq);

  qgrams = MyMalloc(sizeof(LONG)*(1+bases), "query tuple array");
  qgrams[0] = bases - op->q + 1;      /* number of qgrams  in this sequence */

  if(qgrams[0] <= 0) {
    printf("ERROR: Sequence too short!");
    getchar();
    return qgrams;
  }

#ifdef DEBUG
  printf("\nseq %d: %d bases, %lld %d-grams\n",seq, bases, qgrams[0], op->q);
#endif

  cptr = Sequence(sset, seq);

  for(i=0; i<bases; i++) {
    switch (*cptr) {
    case 'A': 
      break;
    case 'C':
      qgram++;
      break;
    case 'G':
      qgram+=2;
      break;
    case 'T':
      qgram+=3;
      break;
    default:
      printf("ERROR: Invalid character encountered in sequence %d!\n", seq);
      break;
    }

    /* index of the q-gram that ends at base i; negative while fewer
       than q bases have been read */
    tuple = i - op->q + 1;
    if(tuple >= 0) {
      qgrams[tuple+1] = qgram & mask;
#ifdef SSET_DEBUG
      printf("tuple %d: %lld\n", tuple, qgrams[tuple+1]);
      printf("short tuple: %lld\n", qgrams[tuple+1] >> (2*(op->q - op->i)));
      /* pause every 100 tuples; the former unparenthesised test
	 "i-op->q+1 % 100 == 0" only fired for the first tuple */
      if(tuple % 100 == 0)
	getchar();
#endif      
    }
    cptr++;

    /* Drop the oldest base before making room for the next one.  The
       stored values are unchanged, but the accumulator now never grows
       beyond 2*q bits; previously it was shifted unmasked, which
       overflows once more bases were read than LONG has bit pairs
       (undefined behaviour if LONG is a signed type). */
    qgram = (qgram & (mask >> 2)) << 2;
  }

#ifdef SSET_DEBUG
  printf("\n");
#endif  

#ifdef DEBUG
  printf("query sequence: "); 
  /* NOTE(review): i equals bases at this point - confirm that this is
     the first argument PrintSeqFromQgrams expects */
  PrintSeqFromQgrams(i, qgrams);
#endif

  return qgrams;
}

LONG *GetReverseSSetQgrams(Q_SSet *sset, int seq, Q_Options *op)
/*  Compute the q-gram numbers of the reverse complement of sequence
    number seq in sset, with q = op->q.

    The sequence is read backwards starting at sset->data+End(sset,seq)-1
    and every base is replaced by the code of its complement:
    T=0, G=1, C=2, A=3 (i.e. A=0, C=1, G=2, T=3 applied to the
    complement), two bits per base, first base of a q-gram in the most
    significant position.

    Returns an array allocated with MyMalloc (caller owns it):
      qgrams[0]             number of q-grams (bases - q + 1)
      qgrams[1..qgrams[0]]  q-gram numbers in reverse-complement order
    Returns NULL if seq is out of range.  If the sequence is shorter
    than q the array is still returned, with qgrams[0] <= 0 and no
    further entries filled in.  Both error paths print a message and
    wait for a key press.

    A character other than A/C/G/T is reported and contributes 0 to the
    q-gram. */
{
  LONG		*qgrams;
  LONG		qgram = 0;
  int		bases;
  int		i;
  int		tuple;
  char		*cptr;
  LONG		mask = 1;
  
  if(seq<0 || seq>=sset->size) {
    printf("ERROR: seqeuence number out of range\n");
    getchar();
    return NULL;
  }

  /* mask with the low 2*q bits set: exactly one q-gram */
  mask = (mask<<(2*op->q))-1;
#ifdef SSET_DEBUG
  /* LONG values are printed through long long; %d did not match */
  printf("Mask: %lld\n", (long long)mask);
#endif

  /*  number of bases in this sequence */
  bases = Bases(sset, seq);

  qgrams = MyMalloc(sizeof(LONG)*(1+bases), "query tuple array");
  qgrams[0] = bases - op->q + 1;   /* number of qgrams  in this sequence */

  if(qgrams[0] <= 0) {
    printf("ERROR: Sequence too short!");
    getchar();
    return qgrams;
  }

#ifdef DEBUG
  /* formerly referenced an undeclared q and printed qgrams[0] with %d */
  printf("\nSequence %d: %d bases, %lld %d-grams\n",
	 seq, bases, (long long)qgrams[0], op->q);
#endif

  cptr = sset->data+End(sset, seq)-1;

  for(i=0; i<bases; i++) {
    switch (*cptr) {
    case 'T': 
      break;
    case 'G':
      qgram++;
      break;
    case 'C':
      qgram+=2;
      break;
    case 'A':
      qgram+=3;
      break;
    default:
      printf("ERROR: Invalid character encountered in sequence %d!\n", seq);
      break;
    }

    /* index of the q-gram that ends at the i-th base read; negative
       while fewer than q bases have been read */
    tuple = i - op->q + 1;
    if(tuple >= 0) {
      qgrams[tuple+1] = qgram & mask;
#ifdef SSET_DEBUG
      printf("tuple %d: %lld\n", tuple, (long long)qgrams[tuple+1]);
      /* pause every 100 tuples; the former unparenthesised test
	 "i-op->q+1 % 100 == 0" only fired for the first tuple */
      if(tuple % 100 == 0)
	getchar();
#endif      
    }
    cptr--;

    /* Drop the oldest base before making room for the next one.  The
       stored values are unchanged, but the accumulator now never grows
       beyond 2*q bits; previously it was shifted unmasked, which
       overflows once more bases were read than LONG has bit pairs
       (undefined behaviour if LONG is a signed type). */
    qgram = (qgram & (mask >> 2)) << 2;
  }

#ifdef SSET_DEBUG
  printf("\n");
#endif  

#ifdef DEBUG
  printf("rquery seq:");
  PrintSeqFromQgrams(op->q, qgrams);
#endif


  return qgrams;
}

void FindSSetEntries(Q_SSet *sset, int *zones)
/*  Translate database zones into sequence numbers, in place.

    zones[0] bounds the list; the zones themselves are stored as pairs
    (zones[i], zones[i+1]) for i = 1, 3, 5, ... while i < zones[0].
    Each pair of database positions is overwritten with the number of
    the first sequence whose End() is not below the zone start and the
    number of the first sequence whose End() is not below the zone end.

    The sequence cursor is never reset between zones, so the zones are
    presumably expected in ascending order - confirm with the caller.
    Both searches stop at the last sequence (sset->size-1); before, only
    the second one did, and a zone start beyond the end of the data made
    the first search run past the last sequence.
    */
{
  int		i;
  int		sq=0;

  for(i=1; i<zones[0]; i+=2) {
#ifdef SSET_DEBUG
    printf("Zone %d: %d - %d \n", i>>1, zones[i], zones[i+1]);
#endif
    /* first sequence that reaches the start of the zone */
    while(sq<sset->size-1 && End(sset, sq)<zones[i])
      sq++;
    zones[i] = sq;
    /* first sequence that reaches the end of the zone */
    while(sq<sset->size-1 && End(sset, sq)<zones[i+1])
      sq++;
    zones[i+1] = sq;
#ifdef SSET_DEBUG
    printf("Zone %d: %d - %d \n", i>>1, zones[i], zones[i+1]);
#endif
  }
}


void PrintSSetInfo(Q_SSet *sset)
/*  Write a summary of sset to stdout: the number of sequences followed
    by one line per sequence giving its first position (offsets[n]) and
    the position just before the next sequence starts (offsets[n+1]-1). */
{
  int		entry = 0;

  printf("sequence set size: %d sequences \n", sset->size);

  while(entry < sset->size) {
    printf("sequence %d starts at %d and ends at %d\n",
	   entry, sset->offsets[entry], sset->offsets[entry+1]-1);
    entry++;
  }
}


void PrintSSet(Q_SSet *sset)
/*  Dump the whole sequence set to stdout: first the summary produced by
    PrintSSetInfo, then every data byte up to offsets[size], with a
    "Sequence n" heading inserted wherever a sequence begins. */
{
  int		pos;
  int		next_seq = 0;

  PrintSSetInfo(sset);

  for(pos=0; pos < sset->offsets[sset->size]; pos++) {
    /* a new sequence begins at this position: announce it */
    if(sset->offsets[next_seq] == pos) {
      printf("\nSequence %d\n", next_seq);
      next_seq++;
    }
    putchar(sset->data[pos]);
  }
  putchar('\n');
}

void WriteSSet(Q_SSet *sset, char *outfile)
/*  Placeholder for writing all sequences of sset to outfile in ASCII.
    Nothing is written yet: the function only reports the number of
    sequences, the requested file name and a "not implemented" notice
    on stdout. */
{
  printf("Total number of sequences in sset: %d \n"
	 "Target output file: %s\n"
	 "CHANGE!!! WRITE NOT IMPLEMENTED YET\n",
	 sset->size, outfile);
}


void PrintSSetEntry(Q_SSet *sset, int seq)
/*  Write sequence number seq of sset to stdout: a "Seq n (m bps)"
    heading followed by the data bytes from Start(sset, seq) up to, but
    not including, End(sset, seq).  No trailing newline is printed.
    The range check on seq is only compiled in when DEBUG is defined. */
{
  int		pos;
  int		length;
  
#ifdef DEBUG
  if(!(seq>=0 && seq<sset->size)) {
    printf("ERROR: seqeuence number out of range\n");
    getchar();
    return;
  }
#endif

  /* distance between consecutive offsets, minus one */
  length = sset->offsets[seq+1] - sset->offsets[seq] - 1;
  printf("\nSeq %d (%d bps)\n",seq, length);

  pos = Start(sset, seq);
  while(pos < End(sset, seq)) {
    putchar(sset->data[pos]);
    pos++;
  }
}


void PrintSSetEntryQgrams(Q_SSet *sset, int seq, int q)
/*  Print all q-grams in the sequence with number seq in sset.

    Prints a "Seq n (m bps)" heading and then one q-gram number per
    line.  Each byte of sset->data is decoded as four 2-bit fields,
    most significant pair first, and the offsets are divided by four
    to find the first and last byte to visit.

    NOTE(review): the other routines in this file read sset->data as
    one ASCII character per base, whereas this one assumes four packed
    bases per byte - confirm whether this function is still valid for
    the current data format. */
{
  int		i;
  int		base_counter = 0;
  int		byte_counter = 0;
  int		mask = 3;
  int		bigmask;
  int		qgram = 0;
  unsigned int	c;
  
  
#ifdef DEBUG
  /* only reports the problem; execution continues with the bad seq */
  if(seq<0 || seq>=sset->size)
    printf("ERROR: seqeuence number out of range\n");
#endif

  printf("\nSeq %d (%d bps)\n",seq, sset->offsets[seq+1]-sset->offsets[seq]);
  /* offset / 4 = index of the byte holding that base (4 bases per byte) */
  byte_counter = sset->offsets[seq]>>2;
  /* low 2*q bits set: keeps exactly q bases in the running value */
  bigmask = (1 << (2*q))-1;

  /* visits bytes offsets[seq]>>2 through offsets[seq+1]>>2 inclusive */
  while(byte_counter != (1+(sset->offsets[seq+1]>>2))) {
    c = sset->data[byte_counter];
    for(i=3; i>=0; i--) {
      /* append the next 2-bit base to the running q-gram */
      qgram += (c >> (2*i)) & mask;
      /* NOTE(review): base_counter starts at 0 for every call but is
	 compared with the absolute offsets of seq - verify for seq > 0 */
      if(base_counter < sset->offsets[seq+1]-1 &&	/*CHANGE THE -1*/
	 base_counter >= sset->offsets[seq]+q-1) 
	/* qgram is an int but is printed with %u */
	printf("%u\n",qgram);
      /* make room for the next base and drop the oldest one */
      qgram = (qgram << 2) & bigmask;
      base_counter++;
    }
    byte_counter++;
  }
  printf("\n");
}

void FastaToSSet(char *fastafile)
/*  Read the Fasta file fastafile.fasta and produce the following files:
    fastafile.sset    :	sequence set (offset table followed by raw data)
    fastafile.raw     :	raw data (sequences in ASCII, separated by SEP_CHAR)
    fastafile.headers :	headers (header lines in ASCII)

    Lines starting with '>' are headers and start a new sequence; on all
    other lines only the characters A, C, G and T are kept, everything
    else is reported on stdout and skipped.  A count of the kept
    characters and the totals are printed at the end.

    The offset table is collected in the temporary file tmp_offset_file
    in the current directory: a slot for the sequence count (filled in
    at the end), one start offset per sequence and a final end offset.
    It is then concatenated with the raw file by a "cat" shell command
    and removed.

    If fastafile is too long for the derived names to fit into a buffer
    of STRING_BUF bytes, an error is printed and nothing is done.

    NOTE(review): fastafile is passed to the shell unquoted, so names
    containing blanks or shell metacharacters will not work. */
{
  FILE	  *infile;
  FILE    *rawfile;
  FILE    *offsetfile;
  FILE	  *headersfile;

  int     i;
  int     n;
  int     counter = 0;
  int     sequences = 0;
  int     at_line_start = 1;  /* next fgets chunk starts a new input line */
  int     in_header = 0;      /* current input line is a header line */
  int	  statistics[256];
  size_t  len;
  char	  s[STRING_BUF];
  char	  f[STRING_BUF];
  
  /* The cat command is the longest string ever formatted into f; if it
     fits, every file name built below fits too. */
  n = snprintf(f, sizeof(f), "cat tmp_offset_file %s.raw >%s.sset",
	       fastafile, fastafile);
  if(n < 0 || (size_t)n >= sizeof(f)) {
    printf("ERROR: file name too long: %s\n", fastafile);
    return;
  }

  snprintf(f, sizeof(f), "%s.fasta", fastafile);
  infile = MyFopen(f, "r");
  snprintf(f, sizeof(f), "%s.raw", fastafile);
  rawfile = MyFopen(f, "w");
  snprintf(f, sizeof(f), "%s.headers", fastafile);
  headersfile = MyFopen(f, "w");  
  offsetfile = MyFopen("tmp_offset_file", "w");

  /* placeholder for the sequence count, overwritten after rewind() */
  fwrite(&counter, sizeof(int), 1, offsetfile);

  for(i=0; i<256; i++)
    statistics[i] = 0;      

  /* Looping on the fgets result ends on end of file as well as on a
     read error; testing feof() alone never ends on a read error. */
  while(fgets(s, STRING_BUF, infile) != NULL) {
    if(at_line_start) {
      in_header = (s[0] == '>');
      if(in_header) {
	/* close the previous sequence and record where the new one starts */
	if(counter > 0) {
	  fputc(SEP_CHAR, rawfile);
	  counter++;
	}
	fwrite(&counter, sizeof(int), 1, offsetfile);
	sequences++;
      }
    }

    if(in_header) {
      /* also reached for the remaining chunks of a header line that is
	 longer than the buffer, so its tail is not taken for bases */
      fprintf(headersfile, "%s", s);
    }
    else {
      /* stop at the newline or at the end of the chunk; a chunk has no
	 newline when the line is longer than the buffer or is the last
	 line of a file that does not end in one */
      for(i=0; s[i] != '\n' && s[i] != '\0'; i++) {
	if(s[i] == 'A' || s[i] == 'C' || s[i] == 'G' || s[i] == 'T'){
	  putc(s[i],rawfile);
	  statistics[(int)s[i]]++;
	  counter++; 
	}
	else {
	  printf("ERROR: Invalid character !\n");
	  printf("Character found : %d\n",(int)s[i]);
	  printf("Skipping character\n");
	}
      }
    }

    len = strlen(s);
    at_line_start = (len > 0 && s[len-1] == '\n');
  }

  /* terminate the last sequence and store the end offset */
  fputc(SEP_CHAR, rawfile);
  counter++;
  fwrite(&counter, sizeof(int), 1, offsetfile);
  /* go back and fill in the sequence count at the start of the table */
  rewind(offsetfile);
  fwrite(&sequences, sizeof(int), 1, offsetfile);
  
  for(i=0; i<256; i++)
    if(statistics[i] > 0)
      printf("%d: %d\n", i, statistics[i]);
  
  printf("Wrote %d byte of sequence data\n", counter);
  printf("Processed %d sequences\n", sequences);
  
  fclose(infile);
  fclose(rawfile);
  fclose(headersfile);
  fclose(offsetfile);
  
  /* length was verified at the top of the function */
  snprintf(f, sizeof(f), "cat tmp_offset_file %s.raw >%s.sset",
	   fastafile, fastafile);
  printf("%s\n", f);
  
  system(f);
  remove("tmp_offset_file");
}

