libStatGen Software  1
GenomeSequence.cpp
1 /*
2  * Copyright (C) 2010-2012 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include "assert.h"
19 #include "ctype.h"
20 #include "stdio.h"
21 #include "Error.h"
22 
23 
24 #include "Generic.h"
25 #include "GenomeSequence.h"
26 
27 #include <algorithm>
28 #include <istream>
29 #include <fstream>
30 #include <sstream>
31 #include <stdexcept>
32 
33 #if defined(_WIN32)
34 #include <io.h>
35 #ifndef R_OK
36 #define R_OK 4
37 #endif
38 #endif
39 
40 // not general use:
41 #include "CSG_MD5.h"
42 
43 //
44 // given a read in a string, pack it into the vector of
45 // bytes coded as two bases per byte.
46 //
47 // The goal is to allow us to more rapidly compare against
48 // the genome, which is itself packed 2 bases per byte.
49 //
50 // Unfortunately, the match position may be odd or even,
51 // so as a result, we also need to be able to prepad
52 // with 'N' bases to get the byte alignment the same, so
53 // padWithNCount may be a positive number indicating how
54 // many N bases to prepend.
55 //
56 void PackedRead::set(const char *rhs, int padWithNCount)
57 {
58  clear();
59 
60  // pad this packed read with 'N' bases two at a time
61  while (padWithNCount>1)
62  {
63  packedBases.push_back(
64  BaseAsciiMap::base2int[(int) 'N'] << 4 |
65  BaseAsciiMap::base2int[(int) 'N']
66  );
67  padWithNCount -= 2;
68  length+=2;
69  }
70 
71  // when we have only one base, pack one 'N' base with
72  // the first base in rhs if there is one.
73  if (padWithNCount)
74  {
75  // NB: *rhs could be NUL, which is ok here - just keep
76  // the length straight.
77  packedBases.push_back(
78  BaseAsciiMap::base2int[(int) *rhs] << 4 |
79  BaseAsciiMap::base2int[(int) 'N']
80  );
81  // two cases - have characters in rhs or we don't:
82  if (*rhs)
83  {
84  length+=2; // pad byte plus one byte from rhs
85  rhs++;
86  }
87  else
88  {
89  length++;
90  }
91  padWithNCount--; // should now be zero, so superfluous.
92  assert(padWithNCount==0);
93  }
94 
95  // pad pairs of bases from rhs, two at a time:
96  while (*rhs && *(rhs+1))
97  {
98  packedBases.push_back(
99  BaseAsciiMap::base2int[(int) *(rhs+1)] << 4 |
100  BaseAsciiMap::base2int[(int) *(rhs+0)]
101  );
102  rhs+=2;
103  length+=2;
104  }
105 
106  // if there is an odd base left at the end, put it
107  // in a byte all its own (low 4 bits == 0):
108  if (*rhs)
109  {
110  packedBases.push_back(
111  BaseAsciiMap::base2int[(int) *(rhs+0)]
112  );
113  length++;
114  }
115  return;
116 }
117 
118 std::string GenomeSequence::IntegerToSeq(unsigned int n, unsigned int wordsize) const
119 {
120  std::string sequence("");
121  for (unsigned int i = 0; i < wordsize; i ++)
122  sequence += "N";
123 
124  unsigned int clearHigherBits = ~(3U << (wordsize<<1)); // XXX refactor - this appears several places
125 
126  if (n > clearHigherBits)
127  error("%d needs to be a non-negative integer < clearHigherBits\n", n);
128 
129  for (unsigned int i = 0; i < wordsize; i++)
130  {
131  sequence[wordsize-1-i] = BaseAsciiMap::int2base[n & 3];
132  n >>= 2;
133  }
134  return sequence;
135 }
136 
137 
138 
140 {
141  constructorClear();
142 }
143 
144 void GenomeSequence::constructorClear()
145 {
146  _debugFlag = 0;
147  _progressStream = NULL;
148  _colorSpace = false;
149  _createOverwrite = false;
150 }
151 
152 void GenomeSequence::setup(const char *referenceFilename)
153 {
154  setReferenceName(referenceFilename);
155 
156  if (_progressStream) *_progressStream << "open and prefetch reference genome " << referenceFilename << ": " << std::flush;
157 
158  if (open(false))
159  {
160  std::cerr << "Failed to open reference genome " << referenceFilename << std::endl;
161  std::cerr << errorStr << std::endl;
162  exit(1);
163  }
164 
165  prefetch();
166  if (_progressStream) *_progressStream << "done." << std::endl << std::flush;
167 }
168 
170 {
171  // free up resources:
172  _umfaFile.close();
173 }
174 
175 //
176 // mapped open.
177 //
178 // if the file exists, map in into memory, and fill in a few useful
179 // fields.
180 //
181 
182 bool GenomeSequence::open(bool isColorSpace, int flags)
183 {
184  bool rc;
185 
186  if (isColorSpace)
187  {
188  _umfaFilename = _baseFilename + "-cs.umfa";
189  }
190  else
191  {
192  _umfaFilename = _baseFilename + "-bs.umfa";
193  }
194 
195  if(access(_umfaFilename.c_str(), R_OK) != 0)
196  {
197  // umfa file doesn't exist, so try to create it.
198  if(create(isColorSpace))
199  {
200  // Couldon't access or create the umfa.
201  std::cerr << "GenomeSequence::open: failed to open file "
202  << _umfaFilename
203  << " also failed creating it."
204  << std::endl;
205  return true;
206  }
207  }
208 
209  rc = genomeSequenceArray::open(_umfaFilename.c_str(), flags);
210  if (rc)
211  {
212  std::cerr << "GenomeSequence::open: failed to open file "
213  << _umfaFilename
214  << std::endl;
215  return true;
216  }
217 
218  _colorSpace = header->_colorSpace;
219 
220  return false;
221 }
222 
223 void GenomeSequence::sanityCheck(MemoryMap &fasta) const
224 {
225  unsigned int i;
226 
227  unsigned int genomeIndex = 0;
228  for (i=0; i<fasta.length(); i++)
229  {
230  switch (fasta[i])
231  {
232  case '>':
233  while (fasta[i]!='\n' && fasta[i]!='\r') i++;
234  break;
235  case '\n':
236  case '\r':
237  break;
238  default:
239  assert(BaseAsciiMap::base2int[(int)(*this)[genomeIndex]] ==
240  BaseAsciiMap::base2int[(int) fasta[i]]);
241  genomeIndex++;
242  break;
243  }
244  }
245 }
246 
247 #define HAS_SUFFIX(str, suffix) ((strlen(suffix) < str.size()) && (str.substr(str.size() - strlen(suffix)) == suffix))
248 //
249 // referenceFilename is either a fasta or a UM fasta (.fa or .umfa)
250 // filename. In both cases, the suffix gets removed and the
251 // base name is kept for later use depending on context.
252 // @return always return false
253 //
254 bool GenomeSequence::setReferenceName(std::string referenceFilename)
255 {
256 
257  if (HAS_SUFFIX(referenceFilename, ".fa"))
258  {
259  _referenceFilename = referenceFilename;
260  _baseFilename = _referenceFilename.substr(0, referenceFilename.size() - 3);
261  }
262  else if (HAS_SUFFIX(referenceFilename, ".umfa"))
263  {
264  _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 5);
265  }
266  else if (HAS_SUFFIX(referenceFilename, "-cs.umfa"))
267  {
268  _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8);
269  }
270  else if (HAS_SUFFIX(referenceFilename, "-bs.umfa"))
271  {
272  _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8);
273  }
274  else
275  {
276  _baseFilename = referenceFilename;
277  }
278  _fastaFilename = _baseFilename + ".fa";
279 
280  if (HAS_SUFFIX(referenceFilename, ".fasta"))
281  {
282  _referenceFilename = referenceFilename;
283  _baseFilename = _referenceFilename.substr(0, referenceFilename.size() - 6);
284  _fastaFilename = _baseFilename + ".fasta";
285  }
286 
287  return false;
288 }
289 
290 //
291 // this works in lockstep with ::create to populate
292 // the per chromosome header fields size and md5
293 // checksum.
294 //
295 // It relies on header->elementCount being set to
296 // the data length loaded so far ... not the ultimate
297 // reference length.
298 //
299 bool GenomeSequence::setChromosomeMD5andLength(uint32_t whichChromosome)
300 {
301  if (whichChromosome>=header->_chromosomeCount) return true;
302 
303  ChromosomeInfo *c = &header->_chromosomes[whichChromosome];
304  c->size = header->elementCount - c->start;
305 
306  MD5_CTX md5Context;
307  uint8_t md5Signature[MD5_DIGEST_LENGTH];
308 
309  //
310  // it's easier to ensure we do this right if we just do it
311  // in one big chunk:
312  //
313  char *md5Buffer = (char *) malloc(c->size);
314 
315  MD5Init(&md5Context);
316 
317  for (genomeIndex_t i = 0; i < c->size; i ++)
318  {
319  md5Buffer[i] = (*this)[c->start + i];
320  }
321  MD5Update(&md5Context, (unsigned char *) md5Buffer, c->size);
322  MD5Final((unsigned char *) &md5Signature, &md5Context);
323  free(md5Buffer);
324  for (int i=0; i<MD5_DIGEST_LENGTH; i++)
325  {
326  // cheesy but works:
327  sprintf(c->md5+2*i, "%02x", md5Signature[i]);
328  }
329  // redundant, strictly speaking due to sprintf NUL terminating
330  // it's output strings, but put it here anyway.
331  c->md5[2*MD5_DIGEST_LENGTH] = '\0';
332 
333  return false;
334 }
335 
336 //
337 // Given a buffer with a fasta format contents, count
338 // the number of chromosomes in it and return that value.
339 //
340 static bool getFastaStats(const char *fastaData, size_t fastaDataSize, uint32_t &chromosomeCount, uint64_t &baseCount)
341 {
342  chromosomeCount = 0;
343  baseCount = 0;
344  bool atLineStart = true;
345 
346  //
347  // loop over the fasta file, essentially matching for the
348  // pattern '^>.*$' and counting them.
349  //
350  for (size_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++)
351  {
352  switch (fastaData[fastaIndex])
353  {
354  case '\n':
355  case '\r':
356  atLineStart = true;
357  break;
358  case '>':
359  {
360  if (!atLineStart) break;
361  chromosomeCount++;
362  //
363  // eat the rest of the line
364  //
365  while (fastaIndex < fastaDataSize && fastaData[fastaIndex]!='\n' && fastaData[fastaIndex]!='\r')
366  {
367  fastaIndex++;
368  }
369  break;
370  }
371  default:
372  baseCount++;
373  atLineStart = false;
374  break;
375  }
376 
377  }
378  return false;
379 }
380 
381 class PackedSequenceData : public std::vector<uint8_t>
382 {
383  std::vector<uint8_t> m_packedBases;
384 
385  size_t m_baseCount;
386 
387  void set(size_t index, uint8_t value) {
388  m_packedBases[index>>1] =
389  (m_packedBases[index>>1] // original value
390  & ~(7<<((index&0x01)<<2))) // logical AND off the original value
391  | ((value&0x0f)<<((index&0x1)<<2)); // logical OR in the new value
392  }
393 
394 public:
395 
396  void reserve(size_t baseCount) {m_packedBases.reserve(baseCount/2);}
397  size_t size() {return m_baseCount;}
398  void clear() {m_packedBases.clear(); m_baseCount = 0;}
399  uint8_t operator [](size_t index)
400  {
401  return (m_packedBases[index>>1] >> ((index&0x1)<<2)) & 0xf;
402  }
403  void push_back(uint8_t base);
404 };
405 
406 //
407 // Load a fasta format file from filename into the buffer
408 // provided by the caller.
409 // While parsing the fasta file, record each chromosome name,
410 // its start location, and its size.
411 //
412 // NB: the caller must implement the logic to determine how
413 // large the sequence data is. There is no correct way to do
414 // this, because we can't reliably estimate here how much sequence
415 // data is contained in a compressed file.
416 //
417 // To safely pre-allocate space in sequenceData, use the reserve() method
418 // before calling this function.
419 //
420 bool loadFastaFile(const char *filename,
421  std::vector<PackedSequenceData> &sequenceData,
422  std::vector<std::string> &chromosomeNames)
423 {
424  InputFile inputStream(filename, "r", InputFile::DEFAULT);
425 
426  if(!inputStream.isOpen()) {
427  std::cerr << "Failed to open file " << filename << "\n";
428  return true;
429  }
430 
431  int whichChromosome = -1;
432  chromosomeNames.clear();
433 
434  char ch;
435  while((ch = inputStream.ifgetc()) != EOF) {
436  switch (ch)
437  {
438  case '\n':
439  case '\r':
440  break;
441  case '>':
442  {
443  std::string chromosomeName = "";
444  //
445  // pull out the chromosome new name
446  //
447  while (!isspace((ch = inputStream.ifgetc())) && ch != EOF)
448  {
449  chromosomeName += ch; // slow, but who cares
450  }
451  //
452  // eat the rest of the line
453  //
454  do {
455  ch = inputStream.ifgetc();
456  } while(ch != EOF && ch != '\n' && ch != '\r');
457  //
458  // save the Chromosome name and index into our
459  // header so we can use them later.
460  //
461  chromosomeNames.push_back(chromosomeName);
462 
463  whichChromosome++;
464 
465  break;
466  }
467  default:
468  // we get here for sequence data.
469  //
470  // save the base value
471  // Note: invalid characters come here as well, but we
472  // let ::set deal with mapping them.
473  break;
474  }
475  }
476  return false;
477 }
478 
479 //
480 // recreate the umfa file from a reference fasta format file
481 //
482 // The general format of a FASTA file is best described
483 // on wikipedia at http://en.wikipedia.org/wiki/FASTA_format
484 //
485 // The format parsed here is a simpler subset, and is
486 // described here http://www.ncbi.nlm.nih.gov/blast/fasta.shtml
487 //
488 bool GenomeSequence::create(bool isColor)
489 {
490  setColorSpace(isColor);
491 
492  if (_baseFilename=="")
493  {
494  std::cerr << "Base reference filename is empty." << std::endl;
495  return true;
496  }
497 
498  if (isColorSpace())
499  {
500  _umfaFilename = _baseFilename + "-cs.umfa";
501  }
502  else
503  {
504  _umfaFilename = _baseFilename + "-bs.umfa";
505  }
506 
507  if (!_createOverwrite && access(_umfaFilename.c_str(), R_OK) == 0)
508  {
509  std::cerr << "Output file '" << _umfaFilename << "' exists or is not writable - please remove." << std::endl;
510  return true;
511  }
512 
513  MemoryMap fastaFile;
514 
515  if (fastaFile.open(_fastaFilename.c_str()))
516  {
517  std::cerr << "failed to open input fasta file '" << _fastaFilename << "'." << std::endl;
518  return true;
519  }
520 
521  std::cerr << "Creating FASTA "
522  << (isColorSpace() ? "color space " : "")
523  << "binary cache file '"
524  << _umfaFilename
525  << "'."
526  << std::endl;
527 
528  std::cerr << std::flush;
529 
530  //
531  // simple ptr to fasta data -- just treat the memory map
532  // as an array of fastaDataSize characters...
533  //
534  const char *fasta = (const char *) fastaFile.data;
535  size_t fastaDataSize = fastaFile.length();
536 
537  uint32_t chromosomeCount = 0;
538  uint64_t baseCount = 0;
539  getFastaStats(fasta, fastaDataSize, chromosomeCount, baseCount);
540 
541  if (genomeSequenceArray::create(_umfaFilename.c_str(), baseCount, chromosomeCount))
542  {
543  std::cerr << "failed to create '"
544  << _umfaFilename
545  << "'."
546  << std::endl;
547  perror("");
548  return true;
549  }
550  header->elementCount = 0;
551  header->_colorSpace = isColorSpace();
552  header->setApplication(_application.c_str());
553  header->_chromosomeCount = chromosomeCount;
554  //
555  // clear out the variable length chromosome info array
556  //
557  for (uint32_t i=0; i<header->_chromosomeCount; i++) header->_chromosomes[i].constructorClear();
558 
559  std::string chromosomeName;
560 
561  //
562  // for converting the reference to colorspace, the first base is always 5 (in base space it is 'N')
563  signed char lastBase = BaseAsciiMap::base2int[(int) 'N'];
564  bool terminateLoad = false;
565  int percent = -1, newPercent;
566  uint32_t whichChromosome = 0;
567  for (uint64_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++)
568  {
569  if (_progressStream)
570  {
571  newPercent = (int) (1.0 * fastaIndex / fastaDataSize) * 100;
572  if (newPercent>percent)
573  {
574  *_progressStream << "\r" << newPercent << "% ";
575  *_progressStream << std::flush;
576  percent = newPercent;
577  }
578  }
579  switch (fasta[fastaIndex])
580  {
581  case '\n':
582  case '\r':
583  break;
584  case '>':
585  {
586  chromosomeName = "";
587  fastaIndex++; // skip the > char
588  //
589  // pull out the chromosome new name
590  //
591  while (!isspace(fasta[fastaIndex]))
592  {
593  chromosomeName += fasta[fastaIndex++]; // slow, but who cares
594  }
595  //
596  // eat the rest of the line
597  //
598  while (fasta[fastaIndex]!='\n' && fasta[fastaIndex]!='\r')
599  {
600  fastaIndex++;
601  }
602  //
603  // save the Chromosome name and index into our
604  // header so we can use them later.
605  //
606  ChromosomeInfo *c = &header->_chromosomes[whichChromosome];
607  c->setChromosomeName(chromosomeName.c_str());
608  c->start = header->elementCount;
609  // c->size gets computed at the next '>' line or at the EOF
610 
611  if (whichChromosome>0)
612  {
613  //
614  // compute md5 checksum for the chromosome that we just
615  // loaded (if there was one) - note that on the last
616  // chromosome, we have to duplicate this code after
617  // the end of this loop
618  //
619  setChromosomeMD5andLength(whichChromosome - 1);
620  }
621  whichChromosome++;
622  if (whichChromosome > header->_chromosomeCount)
623  {
624  std::cerr << "BUG: Exceeded computed chromosome count ("
625  << header->_chromosomeCount
626  << ") - genome is now truncated at chromosome "
627  << header->_chromosomes[header->_chromosomeCount-1].name
628  << " (index "
629  << header->_chromosomeCount
630  << ")."
631  << std::endl;
632  terminateLoad = true;
633  }
634  break;
635  }
636  default:
637  // save the base pair value
638  // Note: invalid characters come here as well, but we
639  // let ::set deal with mapping them.
640  if (isColorSpace())
641  {
642 //
643 // anything outside these values represents an invalid base
644 // base codes: 0-> A, 1-> C, 2-> G, 3-> T
645 // colorspace: 0-> blue, 1-> green, 2-> oragne, 3->red
646 //
647  const char fromBase2CS[] =
648  {
649  /* 0000 */ 0, // A->A
650  /* 0001 */ 1, // A->C
651  /* 0010 */ 2, // A->G
652  /* 0011 */ 3, // A->T
653  /* 0100 */ 1, // C->A
654  /* 0101 */ 0, // C->C
655  /* 0110 */ 3, // C->G
656  /* 0111 */ 2, // C->T
657  /* 1000 */ 2, // G->A
658  /* 1001 */ 3, // G->C
659  /* 1010 */ 0, // G->G
660  /* 1011 */ 1, // G->T
661  /* 1100 */ 3, // T->A
662  /* 1101 */ 2, // T->C
663  /* 1110 */ 1, // T->G
664  /* 1111 */ 0, // T->T
665  };
666  //
667  // we are writing color space values on transitions,
668  // so we don't write a colorspace value when we
669  // get the first base value.
670  //
671  // On second and subsequent bases, write based on
672  // the index table above
673  //
674  char thisBase =
675  BaseAsciiMap::base2int[(int)(fasta[fastaIndex])];
676  if (lastBase>=0)
677  {
678  char color;
679  if (lastBase>3 || thisBase>3) color=4;
680  else color = fromBase2CS[(int)(lastBase<<2 | thisBase)];
681  // re-use the int to base, because ::set expects a base char (ATCG), not
682  // a color code (0123). It should only matter on final output.
683  set(header->elementCount++,
684  BaseAsciiMap::int2base[(int) color]);
685  }
686  lastBase = thisBase;
687  }
688  else
689  {
690  set(header->elementCount++, toupper(fasta[fastaIndex]));
691  }
692  break;
693  }
694 
695  //
696  // slightly awkward exit handling when we exceed the fixed
697  // number of chromosomes
698  //
699  if (terminateLoad) break;
700  }
701 
702  //
703  // also slightly awkward code to handle the last dangling chromosome...
704  // all we should need to do is compute the md5 checksum
705  //
706  if (whichChromosome==0)
707  {
708  fastaFile.close();
709  throw std::runtime_error("No chromosomes found - aborting!");
710  }
711  else
712  {
713  setChromosomeMD5andLength(whichChromosome-1);
714  }
715 
716  fastaFile.close();
717 
718  if (_progressStream) *_progressStream << "\r";
719 
720  std::cerr << "FASTA binary cache file '"
721  << _umfaFilename
722  << "' created."
723  << std::endl;
724 
725  //
726  // leave the umfastaFile open in case caller wants to use it
727  //
728  return false;
729 }
730 
732 {
733  return header->_chromosomeCount;
734 }
735 
736 //return chromosome index: 0, 1, ... 24;
737 int GenomeSequence::getChromosome(genomeIndex_t position) const
738 {
739  if (position == INVALID_GENOME_INDEX) return INVALID_CHROMOSOME_INDEX;
740 
741  if (header->_chromosomeCount == 0)
742  return INVALID_CHROMOSOME_INDEX;
743 
744  int start = 0;
745  int stop = header->_chromosomeCount - 1;
746 
747  // eliminate case where position is in the last chromosome, since the loop
748  // below falls off the end of the list if it in the last one.
749 
750  if (position > header->_chromosomes[stop].start)
751  return (stop);
752 
753  while (start <= stop)
754  {
755  int middle = (start + stop) / 2;
756 
757  if (position >= header->_chromosomes[middle].start && position < header->_chromosomes[middle + 1].start)
758  return middle;
759 
760  if (position == header->_chromosomes[middle + 1].start)
761  return (middle + 1);
762 
763  if (position > header->_chromosomes[middle + 1].start)
764  start = middle + 1;
765 
766  if (position < header->_chromosomes[middle].start)
767  stop = middle - 1;
768  }
769 
770  return -1;
771 }
772 
773 //
774 // Given a chromosome name and 1-based chromosome index, return the
775 // genome index (0 based) into sequence for it.
776 //
777 // NB: the header->chromosomes array contains zero based genome positions
778 //
780  const char *chromosomeName,
781  unsigned int chromosomeIndex) const
782 {
783  genomeIndex_t i = getGenomePosition(chromosomeName);
784  if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX;
785  return i + chromosomeIndex - 1;
786 }
787 
789  int chromosome,
790  unsigned int chromosomeIndex) const
791 {
792  if (chromosome<0 || chromosome >= (int) header->_chromosomeCount) return INVALID_GENOME_INDEX;
793 
794  genomeIndex_t i = header->_chromosomes[chromosome].start;
795  if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX;
796  return i + chromosomeIndex - 1;
797 }
798 
799 //
800 // return the genome index (0 based) of the start of the named
801 // chromosome. If none is found, INVALID_GENOME_INDEX is returned.
802 //
803 // XXX may need to speed this up - and smarten it up with some
804 // modest chromosome name parsing.... e.g. '%d/X/Y' or 'chr%d/chrX/chrY' or
805 // other schemes.
806 //
807 genomeIndex_t GenomeSequence::getGenomePosition(const char *chromosomeName) const
808 {
809  int chromosome = getChromosome(chromosomeName);
810  if (chromosome==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX;
811  return header->_chromosomes[chromosome].start;
812 }
813 
814 int GenomeSequence::getChromosome(const char *chromosomeName) const
815 {
816  unsigned int i;
817  for (i=0; i<header->_chromosomeCount; i++)
818  {
819  if (strcmp(header->_chromosomes[i].name, chromosomeName)==0)
820  {
821  return i;
822  }
823  }
824  return INVALID_CHROMOSOME_INDEX;
825 }
826 
827 //
828 // Given a read, reverse the string and swap the base
829 // pairs for the reverse strand equivalents.
830 //
831 void GenomeSequence::getReverseRead(std::string &read)
832 {
833  std::string newRead;
834  if (read.size()) for (int32_t i=(int) read.size() - 1; i>=0; i--)
835  {
836  newRead.push_back(BasePair(read[i]));
837  }
838  read = newRead;
839 }
840 
841 void GenomeSequence::getReverseRead(String& read)
842 {
843  int i = 0;
844  int j = read.Length()-1;
845  char temp;
846  while (i < j)
847  {
848  temp = read[j];
849  read[j] = read[i];
850  read[i] = temp;
851  }
852 }
853 
854 #define ABS(x) ( (x) > 0 ? (x) : -(x) )
855 int GenomeSequence::debugPrintReadValidation(
856  std::string &read,
857  std::string &quality,
858  char direction,
859  genomeIndex_t readLocation,
860  int sumQuality,
861  int mismatchCount,
862  bool recurse
863 )
864 {
865  int validateSumQ = 0;
866  int validateMismatchCount = 0;
867  int rc = 0;
868  std::string genomeData;
869 
870  for (uint32_t i=0; i<read.size(); i++)
871  {
872  if (tolower(read[i]) != tolower((*this)[readLocation + i]))
873  {
874  validateSumQ += quality[i] - '!';
875  // XXX no longer valid:
876  if (direction=='F' ? i<24 : (i >= (read.size() - 24))) validateMismatchCount++;
877  genomeData.push_back(tolower((*this)[readLocation + i]));
878  }
879  else
880  {
881  genomeData.push_back(toupper((*this)[readLocation + i]));
882  }
883  }
884  assert(validateSumQ>=0);
885  if (validateSumQ != sumQuality && validateMismatchCount == mismatchCount)
886  {
887  printf("SUMQ: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d\n",
888  genomeData.c_str(),
889  read.c_str(),
890  validateSumQ,
891  sumQuality
892  );
893  rc++;
894  }
895  else if (validateSumQ == sumQuality && validateMismatchCount != mismatchCount)
896  {
897  printf("MISM: Original Genome: %s test read: %s : actual mismatch %d test mismatches %d\n",
898  genomeData.c_str(),
899  read.c_str(),
900  validateMismatchCount,
901  mismatchCount
902  );
903  rc++;
904  }
905  else if (validateSumQ != sumQuality && validateMismatchCount != mismatchCount)
906  {
907  printf("BOTH: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d, actual mismatch %d test mismatches %d\n",
908  genomeData.c_str(),
909  read.c_str(),
910  validateSumQ,
911  sumQuality,
912  validateMismatchCount,
913  mismatchCount
914  );
915  rc++;
916  }
917 
918  if (recurse && ABS(validateMismatchCount - mismatchCount) > (int) read.size()/2)
919  {
920  printf("large mismatch difference, trying reverse strand: ");
921  std::string reverseRead = read;
922  std::string reverseQuality = quality;
923  getReverseRead(reverseRead);
924  reverse(reverseQuality.begin(), reverseQuality.end());
925  rc = debugPrintReadValidation(reverseRead, reverseQuality, readLocation, sumQuality, mismatchCount, false);
926  }
927  return rc;
928 }
929 #undef ABS
930 
931 
932 bool GenomeSequence::wordMatch(unsigned int index, std::string &word) const
933 {
934  for (uint32_t i = 0; i<word.size(); i++)
935  {
936  if ((*this)[index + i] != word[i]) return false;
937  }
938  return true;
939 }
940 
941 bool GenomeSequence::printNearbyWords(unsigned int index, unsigned int deviation, std::string &word) const
942 {
943  for (unsigned int i = index - deviation; i < index + deviation; i++)
944  {
945  if (wordMatch(i, word))
946  {
947  std::cerr << "word '"
948  << word
949  << "' found "
950  << i - index
951  << " away from position "
952  << index
953  << "."
954  << std::endl;
955  }
956  }
957  return false;
958 }
959 
960 void GenomeSequence::dumpSequenceSAMDictionary(std::ostream &file) const
961 {
962  for (unsigned int i=0; i<header->_chromosomeCount; i++)
963  {
964  file
965  << "@SQ"
966  << "\tSN:" << header->_chromosomes[i].name // name
967  << "\tLN:" << header->_chromosomes[i].size // number of bases
968  << "\tAS:" << header->_chromosomes[i].assemblyID // e.g. NCBI36.3
969  << "\tM5:" << header->_chromosomes[i].md5
970  << "\tUR:" << header->_chromosomes[i].uri
971  << "\tSP:" << header->_chromosomes[i].species // e.g. Homo_sapiens
972  << std::endl;
973  }
974 }
975 
976 void GenomeSequence::dumpHeaderTSV(std::ostream &file) const
977 {
978  file << "# Reference: " << _baseFilename << std::endl;
979  file << "# SN: sample name - must be unique" << std::endl;
980  file << "# AS: assembly name" << std::endl;
981  file << "# SP: species" << std::endl;
982  file << "# LN: chromosome/contig length" << std::endl;
983  file << "# M5: chromosome/contig MD5 checksum" << std::endl;
984  file << "# LN and M5 are only printed for informational purposes." << std::endl;
985  file << "# Karma will only set those values when creating the index." << std::endl;
986  file << "SN" << "\t" << "AS" << "\t" << "SP" << "\t" << "UR" << "\t" << "LN" << "\t" << "M5" << std::endl;
987  for (unsigned int i=0; i<header->_chromosomeCount; i++)
988  {
989  file
990  << header->_chromosomes[i].name // name
991  << "\t" << header->_chromosomes[i].assemblyID // e.g. NCBI36.3
992  << "\t" << header->_chromosomes[i].uri
993  << "\t" << header->_chromosomes[i].species // e.g. Homo_sapiens
994  << "\t" << header->_chromosomes[i].size // number of bases
995  << "\t" << header->_chromosomes[i].md5
996  << std::endl;
997  }
998 }
999 
1000 
1001 void GenomeSequence::getString(std::string &str, int chromosome, uint32_t index, int baseCount) const
1002 {
1003  //
1004  // calculate the genome index for the lazy caller...
1005  //
1006  genomeIndex_t genomeIndex = header->_chromosomes[chromosome].start + index - 1;
1007 
1008  getString(str, genomeIndex, baseCount);
1009 }
1010 
1011 void GenomeSequence::getString(String &str, int chromosome, uint32_t index, int baseCount) const
1012 {
1013  std::string string;
1014  this-> getString(string, chromosome, index, baseCount);
1015  str = string.c_str();
1016 }
1017 
1018 void GenomeSequence::getString(std::string &str, genomeIndex_t index, int baseCount) const
1019 {
1020  str.clear();
1021  if (baseCount > 0)
1022  {
1023  for (int i=0; i<baseCount; i++)
1024  {
1025  str.push_back((*this)[index + i]);
1026  }
1027  }
1028  else
1029  {
1030  // if caller passed negative basecount, give them
1031  // the read for the 3' end
1032  for (int i=0; i< -baseCount; i++)
1033  {
1034  str.push_back(BaseAsciiMap::base2complement[(int)(*this)[index + i]]);
1035  }
1036  }
1037 }
1038 
1039 void GenomeSequence::getString(String &str, genomeIndex_t index, int baseCount) const
1040 {
1041  std::string string;
1042  getString(string, index, baseCount);
1043  str = string.c_str();
1044 }
1045 
1046 void GenomeSequence::getHighLightedString(std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd) const
1047 {
1048  str.clear();
1049  if (baseCount > 0)
1050  {
1051  for (int i=0; i<baseCount; i++)
1052  {
1053  char base = (*this)[index + i];
1054  if (in(index+i, highLightStart, highLightEnd))
1055  base = tolower(base);
1056  str.push_back(base);
1057  }
1058  }
1059  else
1060  {
1061  // if caller passed negative basecount, give them
1062  // the read for the 3' end
1063  for (int i=0; i< -baseCount; i++)
1064  {
1065  char base = BaseAsciiMap::base2complement[(int)(*this)[index + i]];
1066  if (in(index+i, highLightStart, highLightEnd))
1067  base = tolower(base);
1068  str.push_back(base);
1069  }
1070  }
1071 }
1072 
1073 void GenomeSequence::print30(genomeIndex_t index) const
1074 {
1075  std::cout << "index: " << index << "\n";
1076  for (genomeIndex_t i=index-30; i<index+30; i++)
1077  std::cout << (*this)[i];
1078  std::cout << "\n";
1079  for (genomeIndex_t i=index-30; i<index; i++)
1080  std::cout << " ";
1081  std::cout << "^";
1082  std::cout << std::endl;
1083 }
1084 
1085 void GenomeSequence::getMismatchHatString(std::string &result, const std::string &read, genomeIndex_t location) const
1086 {
1087  result = "";
1088  for (uint32_t i=0; i < read.size(); i++)
1089  {
1090  if (read[i] == (*this)[location+i])
1091  result.push_back(' ');
1092  else
1093  result.push_back('^');
1094  }
1095 }
1096 
1097 void GenomeSequence::getMismatchString(std::string &result, const std::string &read, genomeIndex_t location) const
1098 {
1099  result = "";
1100  for (uint32_t i=0; i < read.size(); i++)
1101  {
1102  if (read[i] == (*this)[location+i])
1103  result.push_back(toupper(read[i]));
1104  else
1105  result.push_back(tolower(read[i]));
1106  }
1107 }
1108 
1109 genomeIndex_t GenomeSequence::simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index, int windowSize) const
1110 {
1111  int bestScore = 1000000; // either mismatch count or sumQ
1112  genomeIndex_t bestMatchLocation = INVALID_GENOME_INDEX;
1113  for (int i=-windowSize; i<windowSize; i++)
1114  {
1115  int newScore;
1116 
1117  if (i<0 && ((uint32_t) -i) > index) continue;
1118  if (index + i + read.size() >= getNumberBases()) continue;
1119 
1120  if (quality=="")
1121  {
1122  newScore = this->getMismatchCount(read, index + i);
1123  }
1124  else
1125  {
1126  newScore = this->getSumQ(read, quality, index + i);
1127  }
1128  if (newScore < bestScore)
1129  {
1130  bestScore = newScore;
1131  bestMatchLocation = index + i;
1132  }
1133  }
1134  return bestMatchLocation;
1135 }
1136 
1137 std::ostream &operator << (std::ostream &stream, genomeSequenceMmapHeader &h)
1138 {
1139  stream << (MemoryMapArrayHeader &) h;
1140  stream << "chromosomeCount: " << h._chromosomeCount << std::endl;
1141  stream << "isColorSpace: " << h._colorSpace << std::endl;
1142  stream << "chromosomeCount: " << h._chromosomeCount << std::endl;
1143  uint64_t totalSize = 0;
1144  for (uint32_t i=0; i < h._chromosomeCount; i++)
1145  {
1146  totalSize += h._chromosomes[i].size;
1147  stream << "Chromosome Index " << i << " name: " << h._chromosomes[i].name << std::endl;
1148  stream << "Chromosome Index " << i << " whole genome start: " << h._chromosomes[i].start << std::endl;
1149  stream << "Chromosome Index " << i << " whole genome size: " << h._chromosomes[i].size << std::endl;
1150  stream << "Chromosome Index " << i << " md5 checksum: " << h._chromosomes[i].md5 << std::endl;
1151  stream << "Chromosome Index " << i << " assemblyID: " << h._chromosomes[i].assemblyID << std::endl;
1152  stream << "Chromosome Index " << i << " species: " << h._chromosomes[i].species << std::endl;
1153  stream << "Chromosome Index " << i << " URI: " << h._chromosomes[i].uri << std::endl;
1154  }
1155  stream << "Total Genome Size: " << totalSize << " bases."<< std::endl;
1156  if (totalSize != h.elementCount)
1157  {
1158  stream << "Total Genome Size: does not match elementCount!\n";
1159  }
1160 
1161  stream << std::endl;
1162  return stream;
1163 }
1164 
1165 void GenomeSequence::getChromosomeAndIndex(std::string &s, genomeIndex_t i) const
1166 {
1167  int whichChromosome = 0;
1168 
1169  whichChromosome = getChromosome(i);
1170 
1171  if (whichChromosome == INVALID_CHROMOSOME_INDEX)
1172  {
1173  s = "invalid genome index"; // TODO include the index in error
1174  }
1175  else
1176  {
1177  std::ostringstream buf;
1178  genomeIndex_t chromosomeIndex = i - getChromosomeStart(whichChromosome) + 1;
1179  buf << header->_chromosomes[whichChromosome].name << ":" << chromosomeIndex;
1180 #if 0
1181  buf << " (GenomeIndex " << i << ")";
1182 #endif
1183  s = buf.str();
1184  }
1185  return;
1186 }
1187 
1188 
1189 void GenomeSequence::getChromosomeAndIndex(String& s, genomeIndex_t i) const
1190 {
1191  std::string ss;
1192  getChromosomeAndIndex(ss, i);
1193  s = ss.c_str();
1194  return;
1195 }
1196 
1197 
1198 //
1199 // This is intended to be a helper routine to get dbSNP files
1200 // loaded. In some cases, we will load into an mmap() file (ie
1201 // when we are creating it), in others, we will simply be loading
1202 // an existing dbSNP file into RAM (when the binary file does not
1203 // exist or when we are running with useMemoryMapFlag == false.
1204 //
1205 // Assume that dbSNP exists, is writable, and is the right size.
1206 //
1207 // Using the dbSNPFilename given, mark each dbSNP position
1208 // with a bool true.
1209 //
1210 // Return value:
1211 // True: if populateDBSNP() succeed
1212 // False: if not succeed
1213 bool GenomeSequence::populateDBSNP(
1214  mmapArrayBool_t &dbSNP,
1215  IFILE inputFile) const
1216 {
1217  assert(dbSNP.getElementCount() == getNumberBases());
1218 
1219  if(inputFile == NULL)
1220  {
1221  // FAIL, file not opened.
1222  return(false);
1223  }
1224 
1225  std::string chromosomeName;
1226  std::string position;
1227  genomeIndex_t chromosomePosition1; // 1-based
1228  uint64_t ignoredLineCount = 0;
1229 
1230  // Read til the end of the file.
1231  char* postPosPtr = NULL;
1232  while(!inputFile->ifeof())
1233  {
1234  chromosomeName.clear();
1235  position.clear();
1236  // Read the chromosome
1237  if(inputFile->readTilTab(chromosomeName) <= 0)
1238  {
1239  // hit either eof or end of line, check if
1240  // it is a header.
1241  if(chromosomeName.size()>0 && chromosomeName[0]=='#')
1242  {
1243  // header, so just continue.
1244  continue;
1245  }
1246  // Not the header, so this line is poorly formatted.
1247  ++ignoredLineCount;
1248  // Continue to the next line.
1249  continue;
1250  }
1251 
1252  // Check if it is a header line.
1253  if(chromosomeName.size()>0 && chromosomeName[0]=='#')
1254  {
1255  // did not hit eof or end of line,
1256  // so discard the rest of the line.
1257  inputFile->discardLine();
1258  continue;
1259  }
1260 
1261  // Not a header, so read the position.
1262  if(inputFile->readTilTab(position) > 0)
1263  {
1264  // Additional data on the line, so discard it.
1265  inputFile->discardLine();
1266  }
1267 
1268  // Convert the position to a string.
1269  chromosomePosition1 = strtoul(position.c_str(), &postPosPtr, 0);
1270 
1271 
1272  if(postPosPtr == position.c_str())
1273  {
1274  ++ignoredLineCount;
1275  continue;
1276  }
1277 
1278  // 1-based genome index.
1279  genomeIndex_t genomeIndex =
1280  getGenomePosition(chromosomeName.c_str(), chromosomePosition1);
1281 
1282  // if the genome index is invalid, ignore it
1283  if((genomeIndex == INVALID_GENOME_INDEX) ||
1284  (genomeIndex > getNumberBases()))
1285  {
1286  ignoredLineCount++;
1287  continue;
1288  }
1289 
1290  dbSNP.set(genomeIndex, true);
1291  }
1292 
1293  if (ignoredLineCount > 0)
1294  {
1295  std::cerr << "GenomeSequence::populateDBSNP: ignored " << ignoredLineCount << " SNP positions due to invalid format of line." << std::endl;
1296  return false;
1297  }
1298  return true;
1299 }
1300 
1302  mmapArrayBool_t &dbSNP,
1303  const char *inputFileName) const
1304 {
1305  //
1306  // the goal in this section of code is to allow the user
1307  // to either specify a valid binary version of the SNP file,
1308  // or the original text file that it gets created from.
1309  //
1310  // To do this, we basically open, sniff the error message,
1311  // and if it claims it is not a binary version of the file,
1312  // we go ahead and treat it as the text file and use the
1313  // GenomeSequence::populateDBSNP method to load it.
1314  //
1315  // Further checking is really needed to ensure users don't
1316  // mix a dbSNP file for a different reference, since it is really
1317  // easy to do.
1318  //
1319  if (strlen(inputFileName)!=0)
1320  {
1321  std::cerr << "Load dbSNP file '" << inputFileName << "': " << std::flush;
1322 
1323  if (dbSNP.open(inputFileName, O_RDONLY))
1324  {
1325  //
1326  // failed to open, possibly due to bad magic.
1327  //
1328  // this is really awful ... need to have a return
1329  // code that is smart enough to avoid this ugliness:
1330  //
1331  if (dbSNP.getErrorString().find("wrong type of file")==std::string::npos)
1332  {
1333  std::cerr << "Error: " << dbSNP.getErrorString() << std::endl;
1334  exit(1);
1335  }
1336  //
1337  // we have a file, assume we can load it as a text file
1338  //
1339  IFILE inputFile = ifopen(inputFileName, "r");
1340  if(inputFile == NULL)
1341  {
1342  std::cerr << "Error: failed to open " << inputFileName << std::endl;
1343  exit(1);
1344  }
1345 
1346  std::cerr << "(as text file) ";
1347 
1348  // anonymously (RAM resident only) create:
1349  dbSNP.create(getNumberBases());
1350 
1351  // now load it into RAM
1352  populateDBSNP(dbSNP, inputFile);
1353  ifclose(inputFile);
1354 
1355  }
1356  else
1357  {
1358  std::cerr << "(as binary mapped file) ";
1359  }
1360 
1361  std::cerr << "DONE!" << std::endl;
1362  return false;
1363  }
1364  else
1365  {
1366  return true;
1367  }
1368 }
1369 
1370 
1371 #if defined(TEST)
1372 
1373 void simplestExample(void)
1374 {
1375  GenomeSequence reference;
1376  genomeIndex_t index;
1377 
1378  // a particular reference is set by:
1379  // reference.setFastaName("/usr/cluster/share/karma/human_g1k_v37_12CS.fa")
1380  //
1381  // In the above example, the suffix .fa is stripped and replaced with .umfa,
1382  // which contains the actual file being opened.
1383  //
1384  if (reference.open())
1385  {
1386  perror("GenomeSequence::open");
1387  exit(1);
1388  }
1389 
1390 
1391  index = 1000000000; // 10^9
1392  //
1393  // Write the base at the given index. Here, index is 0 based,
1394  // and is across the whole genome, as all chromosomes are sequentially
1395  // concatenated, so the allowed range is
1396  //
1397  // 0.. (reference.getChromosomeStart(last) + reference.getChromosomeSize(last))
1398  //
1399  // (where int last = reference.getChromosomeCount() - 1;)
1400  //
1401  std::cout << "base[" << index << "] = " << reference[index] << std::endl;
1402 
1403  //
1404  // Example for finding chromosome and one based chromosome position given
1405  // and absolute position on the genome in 'index':
1406  //
1407  int chr = reference.getChromosome(index);
1408  genomeIndex_t chrIndex = index - reference.getChromosomeStart(chr) + 1; // 1-based
1409 
1410  std::cout << "genome index " << index << " corresponds to chromosome " << chr << " position " << chrIndex << std::endl;
1411 
1412  //
1413  // Example for finding an absolute genome index position when the
1414  // chromosome name and one based position are known:
1415  //
1416  const char *chromosomeName = "5";
1417  chr = reference.getChromosome(chromosomeName); // 0-based
1418  chrIndex = 100000; // 1-based
1419 
1420  index = reference.getChromosomeStart(chr) + chrIndex - 1;
1421 
1422  std::cout << "Chromosome '" << chromosomeName << "' position " << chrIndex << " corresponds to genome index position " << index << std::endl;
1423 
1424  reference.close();
1425 }
1426 
1427 void testGenomeSequence(void)
1428 {
1429  GenomeSequence reference;
1430 
1431 #if 0
1432  std::string referenceName = "someotherreference";
1433  if (reference.setFastaName(referenceName))
1434  {
1435  std::cerr << "failed to open reference file "
1436  << referenceName
1437  << std::endl;
1438  exit(1);
1439  }
1440 #endif
1441 
1442  std::cerr << "open and prefetch the reference genome: ";
1443 
1444  // open it
1445  if (reference.open())
1446  {
1447  exit(1);
1448  }
1449  std::cerr << "done!" << std::endl;
1450 
1451  //
1452  // For the human genome, genomeIndex ranges from 0 to 3.2x10^9
1453  //
1454  genomeIndex_t genomeIndex; // 0 based
1455  unsigned int chromosomeIndex; // 1 based
1456  unsigned int chromosome; // 0..23 or so
1457  std::string chromosomeName;
1458 
1459  //
1460  // Here we'll start with a chromosome name, then obtain the genome
1461  // index, and use it to find the base we want:
1462  //
1463  chromosomeName = "2";
1464  chromosomeIndex = 1234567;
1465  // this call is slow (string search for chromsomeName):
1466  genomeIndex = reference.getGenomePosition(chromosomeName.c_str(), chromosomeIndex);
1467  assert(genomeIndex!=INVALID_GENOME_INDEX);
1468  std::cout << "Chromosome " << chromosomeName << ", index ";
1469  std::cout << chromosomeIndex << " contains base " << reference[genomeIndex];
1470  std::cout << " at genome index position " << genomeIndex << std::endl;
1471 
1472  //
1473  // now reverse it - given a genomeIndex from above, find the chromosome
1474  // name and index:
1475  //
1476 
1477  // slow (binary search on genomeIndex):
1478  chromosome = reference.getChromosome(genomeIndex);
1479  unsigned int newChromosomeIndex;
1480  // not slow:
1481  newChromosomeIndex = genomeIndex - reference.getChromosomeStart(chromosome) + 1;
1482 
1483  assert(chromosomeIndex == newChromosomeIndex);
1484 
1485  // more testing... at least test and use PackedRead:
1486  //
1487  PackedRead pr;
1488 
1489  pr.set("ATCGATCG", 0);
1490  assert(pr.size()==8);
1491  assert(pr[0]==BaseAsciiMap::base2int[(int) 'A']);
1492  assert(pr[1]==BaseAsciiMap::base2int[(int) 'T']);
1493  assert(pr[2]==BaseAsciiMap::base2int[(int) 'C']);
1494  assert(pr[3]==BaseAsciiMap::base2int[(int) 'G']);
1495  pr.set("ATCGATCG", 1);
1496  assert(pr.size()==9);
1497  pr.set("", 0);
1498  assert(pr.size()==0);
1499  pr.set("", 1);
1500  assert(pr.size()==1);
1501  pr.set("", 2);
1502  assert(pr.size()==2);
1503  pr.set("", 3);
1504  assert(pr.size()==3);
1505  assert(pr[0]==BaseAsciiMap::base2int[(int) 'N']);
1506  assert(pr[1]==BaseAsciiMap::base2int[(int) 'N']);
1507  assert(pr[2]==BaseAsciiMap::base2int[(int) 'N']);
1508  pr.set("C", 1);
1509  assert(pr.size()==2);
1510  assert(pr[0]==BaseAsciiMap::base2int[(int) 'N']);
1511  assert(pr[1]==BaseAsciiMap::base2int[(int) 'C']);
1512 
1513 }
1514 
1515 //
1516 // After I build libcsg, I compile and run this test code using:
1517 //
1518 // g++ -DTEST -o try GenomeSequence.cpp -L. -lcsg -lm -lz -lssl
1519 // you also may need -fno-rtti
1520 // ./try
1521 //
1522 int main(int argc, const char **argv)
1523 {
1524 #if 1
1525  simplestExample();
1526 #else
1527  testGenomeSequence();
1528 #endif
1529  exit(0);
1530 }
1531 
1532 #endif
GenomeSequence::~GenomeSequence
~GenomeSequence()
Close the file if open and destroy the object.
Definition: GenomeSequence.cpp:169
String
Definition: StringBasics.h:39
InputFile::DEFAULT
@ DEFAULT
Check the extension, if it is ".gz", treat as gzip, otherwise treat it as UNCOMPRESSED.
Definition: InputFile.h:45
GenomeSequence::getChromosomeStart
genomeIndex_t getChromosomeStart(int chromosomeIndex) const
given a chromosome, return the genome base it starts in
Definition: GenomeSequence.h:246
GenomeSequence
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
Definition: GenomeSequence.h:100
MemoryMap::open
virtual bool open(const char *file, int flags=O_RDONLY)
open a previously created mapped vector
Definition: MemoryMap.cpp:156
ifopen
IFILE ifopen(const char *filename, const char *mode, InputFile::ifileCompression compressionMode=InputFile::DEFAULT)
Open a file with the specified name and mode, using a filename of "-" to indicate stdin/stdout.
Definition: InputFile.h:562
GenomeSequence::GenomeSequence
GenomeSequence()
Simple constructor - no implicit file open.
Definition: GenomeSequence.cpp:139
GenomeSequence::getChromosomeCount
int getChromosomeCount() const
Return the number of chromosomes in the genome.
Definition: GenomeSequence.cpp:731
PackedRead
Definition: GenomeSequenceHelpers.h:142
MemoryMapArrayHeader
Definition: MemoryMapArray.h:65
InputFile::ifeof
int ifeof() const
Check to see if we have reached the EOF.
Definition: InputFile.h:386
PackedSequenceData
Definition: GenomeSequence.cpp:382
InputFile::readTilTab
int readTilTab(std::string &field)
Read, appending the characters into the specified string until tab, new line, or EOF is found,...
Definition: InputFile.cpp:133
BaseAsciiMap::base2int
static unsigned char base2int[256+1]
Map ASCII values to a 2 (or 3) bit encoding for the base pair value for just base space (ACTGNactgn).
Definition: BaseAsciiMap.h:61
InputFile::discardLine
int discardLine()
Read until the end of the line, discarding the characters, returning -1 returned for EOF and returnin...
Definition: InputFile.cpp:95
GenomeSequence::getChromosome
int getChromosome(genomeIndex_t position) const
given a whole genome index, get the chromosome it is located in
Definition: GenomeSequence.cpp:737
MemoryMap
There are a pair of related data structures in the operating system, and also a few simple algorithms...
Definition: MemoryMap.h:156
GenomeSequence::getMismatchCount
int getMismatchCount(std::string &read, genomeIndex_t location, char exclude='\0') const
Return the mismatch count, disregarding CIGAR strings.
Definition: GenomeSequence.h:488
MemoryMapArray::open
bool open(const char *file, int flags=O_RDONLY)
open a previously created mapped vector
Definition: MemoryMapArray.h:269
MemoryMapArray
Definition: MemoryMapArray.h:142
genomeSequenceMmapHeader
Definition: GenomeSequenceHelpers.h:80
InputFile
Class for easily reading/writing files without having to worry about file type (uncompressed,...
Definition: InputFile.h:37
GenomeSequence::isColorSpace
bool isColorSpace() const
tell us if we are a color space reference or not
Definition: GenomeSequence.h:209
GenomeSequence::setReferenceName
bool setReferenceName(std::string referenceFilename)
set the reference name that will be used in open()
Definition: GenomeSequence.cpp:254
MemoryMapArray::create
int create(const char *file, indexT elementCount, int optionalHeaderCount=0)
Create a vector with elementCount memebers.
Definition: MemoryMapArray.h:208
GenomeSequence::getSumQ
int getSumQ(std::string &read, std::string &qualities, genomeIndex_t location) const
brute force sumQ - no sanity checking
Definition: GenomeSequence.h:501
GenomeSequence::open
bool open(bool isColorSpace=false, int flags=O_RDONLY)
open the reference specified using GenomeSequence::setReferenceName
Definition: GenomeSequence.cpp:182
ifclose
int ifclose(IFILE &file)
Close the file.
Definition: InputFile.h:580
operator<<
InputFile & operator<<(InputFile &stream, const std::string &str)
Write to a file using streaming.
Definition: InputFile.h:736
GenomeSequence::loadDBSNP
bool loadDBSNP(mmapArrayBool_t &dbSNP, const char *inputFileName) const
user friendly dbSNP loader.
Definition: GenomeSequence.cpp:1301
ChromosomeInfo
Definition: GenomeSequenceHelpers.h:41
BaseAsciiMap::int2base
static const char int2base[]
Convert from int representation to the base.
Definition: BaseAsciiMap.h:38
BaseAsciiMap::base2complement
static unsigned char base2complement[]
This table maps 5' base space to the 3' complement base space values, as well as 5' color space value...
Definition: BaseAsciiMap.h:41
GenomeSequence::getGenomePosition
genomeIndex_t getGenomePosition(const char *chromosomeName, unsigned int chromosomeIndex) const
given a chromosome name and position, return the genome position
Definition: GenomeSequence.cpp:779
MD5_CTX
Definition: CSG_MD5.h:60
GenomeSequence::getNumberBases
genomeIndex_t getNumberBases() const
return the number of bases represented in this reference
Definition: GenomeSequence.h:216