/*-----------------------------------------------------------------------------

   QUASAR - q-gram Alignment based on Suffix ARrays

   Copyright (C) 1998 Stefan Burkhardt
   Author: Stefan Burkhardt <stburk@mpi-sb.mpg.de>
   This file is part of the QUASAR package.

   QUASAR is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   QUASAR is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with the QUASAR package; see the file copying.  If not,
   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.  or contact the author. 

-------------------------------------------------------------------------------

  Search Precomputing
  Also creates a compressed suffix array by removing invalid q-grams.
  
  $File$
  $Revision: 1.2 $
  $Date: Wed, 29 Mar 2000 11:07:45 +0200 $

call syntax :
q_pre db q-gram_length

-----------------------------------------------------------------------------*/

#include "q_pre.h"

int check_string(char *s, int q)
/*  Test whether the first q characters of s are all drawn from the
    nucleotide letters A, C, G and T (upper case only).
    Returns 1 if they are, 0 as soon as any other character is met. A
    string shorter than q fails, because its terminating NUL is not one
    of the four letters. For q <= 0 nothing is inspected and 1 is
    returned. */
{
  int pos = 0;

  while(pos < q) {
    switch(s[pos]) {
    case 'A':
    case 'C':
    case 'G':
    case 'T':
      pos++;
      break;
    default:
      return(0);
    }
  }
  return(1);
}

int left_pos(char *s, int length, int *sa, int sa_len, Q_SSet *sset)
/*  Find the leftmost occurrence of a suffix starting with the first length
    characters of s in the suffix array for the sequence set sset.

    s       search string, only its first length characters are compared
    sa      suffix array: offsets into sset->data; the binary search relies
            on the suffixes being in ascending strncmp order
    sa_len  number of entries in sa (valid indices are 0 .. sa_len-1)

    Returns the smallest index i whose suffix compares >= s on the first
    length characters, or sa_len if every suffix compares lower. If s does
    not occur, this is the position where it would have to be inserted. */
{
  int l = 0;
  int r = sa_len - 1;	/* last valid index; sa[sa_len] must never be read */
  int m, diff;

  while(l <= r)
    {
      /* l + (r-l)/2 instead of (l+r)/2: cannot overflow for large arrays */
      m = l + ((r-l)>>1);

      diff = strncmp(sset->data+sa[m], s, length);

      if(diff >= 0)
	r = m-1;
      else
	l = m+1;
    }
  return(l);
}


int right_pos(char *s, int length, int *sa, int sa_len, Q_SSet *sset)
/*  Find the position just past the rightmost occurrence of a suffix
    starting with the first length characters of s in the suffix array for
    the sequence set sset.

    s       search string, only its first length characters are compared
    sa      suffix array: offsets into sset->data; the binary search relies
            on the suffixes being in ascending strncmp order
    sa_len  number of entries in sa (valid indices are 0 .. sa_len-1)

    Returns the smallest index i whose suffix compares > s on the first
    length characters (the first occurrence of the next higher string), or
    sa_len if no suffix compares higher. */
{
  int l = 0;
  int r = sa_len - 1;	/* last valid index; sa[sa_len] must never be read */
  int m, diff;

  while(l <= r)
    {
      /* l + (r-l)/2 instead of (l+r)/2: cannot overflow for large arrays */
      m = l + ((r-l)>>1);

      diff = strncmp(sset->data+sa[m], s, length);

      if(diff <= 0)
	l = m+1;
      else
	r = m-1;
    }
  return(l);
}


int find_string(char *s, int *length, int *sa, int sa_len, Q_SSet *sset)
/*  Locate the run of suffix array entries whose suffix starts with the
    first *length characters of s.

    length is an in/out parameter: on entry it holds the number of
    characters of s to compare, on return it holds the difference between
    the results of right_pos and left_pos, i.e. the size of the run.

    Returns the result of left_pos, the index where the run starts. */
{
  const int prefix_len = *length;
  const int first      = left_pos(s, prefix_len, sa, sa_len, sset);
  const int past_last  = right_pos(s, prefix_len, sa, sa_len, sset);

#ifdef DEBUG
  /* interactive trace for one hard-coded probe string */
  if(strncmp(s, "TTTTTTTTTTTTTTT", 11) == 0){
    printf("Search String: %s, sa_len: %d\n", s, sa_len);
    printf("First hit at %d: %s\n", first, sset->data+sa[first]);
    printf("Last hit at %d: %s\n", past_last, sset->data+sa[past_last]);
    getchar();
  }
#endif

  *length = past_last - first;
  return(first);
}



/* MAIN PROGRAM  */
int main (int argc, char *argv[])
{
  Q_SSet	*db;
  int		*sa;

  off64_t	sa_len, csa_len=0;
  int		comb;
  int		q;
  int		i, j;
  int		start, length;
  char		alphabet[] = "ACGT";
  char	        buffer[1000];
  FILE	       *safile;
  FILE	       *csafile;
  FILE	       *srfile;

  printf("q_pre: precomputing suffix array searches $Revision: 1.2 $\n");
  printf("by Stefan Burkhardt, $Date: Wed, 29 Mar 2000 11:07:45 +0200 $\n");

  if(argc != 3)
    {
      printf("ERROR: wrong number of Arguments \n");
      printf("call syntax: q_pre dbname q \n");
      exit(-1);
    }

  /* READ CHOSEN Q-GRAM LENGTH */
  q = atoi(argv[2]);
  printf("Precomputing all searches for q-grams of length %d\n", q);

  /* READ REQUIRED DATA FROM DISK */
  db = ReadSSet(argv[1]);
  sprintf(buffer,"%s.sa",argv[1]);
  printf("Reading suffix array from %s\n", buffer);
  safile = MyFopen(buffer, "r");
  fseeko64(safile, 0, SEEK_END);
  sa_len = ftello64(safile);
  fseeko64(safile, 0, SEEK_SET);
  printf("Number of bytes in sa file: %lld\n", (LONG)sa_len);
  sa=MyMalloc(sa_len, "suffix array");
  MyFread(sa, (LONG)sa_len, safile);
  
  /* OPEN OUTPUTFILES */
  sprintf(buffer,"%s.sr%d",argv[1],q);
  printf("Writing search results to %s\n", buffer);
  srfile = MyFopen(buffer, "w");

  sprintf(buffer, "%s.csa%d", argv[1], q);
  printf("Writing compressed suffix array to %s\n", buffer);
  csafile = MyFopen(buffer, "w+");

  /* COMPRESS SUFFIX ARRAY */
  for(i=0; i<sa_len>>2; i++) {
#ifdef DEBUG
    printf("Checking %d: %.11s", i, db->data+sa[i]);
#endif
    if(check_string(db->data+sa[i], q)) {
#ifdef DEBUG
      printf("  +");
#endif
      fwrite(&sa[i], sizeof(int), 1, csafile);
      csa_len++;
    }
    if(i%1000 == 0)
      printf("Checked %d q-grams\n", i);
#ifdef DEBUG
    printf("\n");
#endif
  }
  printf("Removed %lld superfluous q-grams\n", (sa_len / sizeof(int))-csa_len);
  printf("Wrote %lld bytes to csa file \n", csa_len * sizeof(int));
  fseeko64(csafile, 0, SEEK_SET);
  MyFread(sa, csa_len * sizeof(int), csafile);

  /* SEARCH FOR ALL POSSIBLE Q-GRAMS */
  comb = 1<<(2*q);
  printf("Number of different %d-grams: %d\n", q, comb);

  buffer[q] = '\0';
  for(i=0; i<comb; i++)
    {
      for(j=0; j<q; j++)
	buffer[q-j-1] = alphabet[((int)(i/(1<<(2*j))))%4];
      length = q;
      start = find_string(buffer, &length, sa, csa_len, db);
      fwrite(&start, sizeof(int), 1, srfile);
      if((i % 100000) == 0)
	printf("%d searches completed, found %d %d-grams so far \n", i, start, q);
    }
  i = (int) csa_len;
  fwrite(&i, sizeof(int), 1, srfile);
  fclose(srfile);
  fclose(safile);
  fclose(csafile);

  return(0);
}
