libStatGen Software  1
Pileup.h
1 /*
2  * Copyright (C) 2010 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #ifndef __PILEUP_H__
19 #define __PILEUP_H__
20 
21 #include <stdexcept>
22 #include "SamFile.h"
23 #include "PosList.h"
24 
25 
27 {
28 public:
29  static const int DEFAULT_WINDOW_SIZE = 1024;
30 };
31 
32 template <class PILEUP_TYPE>
34 {
35 public:
36  bool operator() (PILEUP_TYPE& element)
37  {
38  element.analyze();
39  return(true);
40  }
41  void analyze(PILEUP_TYPE element)
42  {
43  element.analyze();
44  }
45 };
46 
47 template <class PILEUP_TYPE>
48 void defaultPileupAnalyze(PILEUP_TYPE& element)
49 {
50  element.analyze();
51 }
52 
53 
54 /// Class to perform a pileup of all reads by position, assuming
55 /// the reads are coordinate sorted.
56 template <class PILEUP_TYPE,
57  class FUNC_CLASS = defaultPileup<PILEUP_TYPE> >
58 class Pileup
59 {
60 public:
61  /// Constructor using the default maximum number of bases a read spans.
62  Pileup(const FUNC_CLASS& fp = FUNC_CLASS());
63 
64  /// Constructor that sets the maximum number of bases a read spans.
65  /// This is the "window" the length of the buffer that holds
66  /// the pileups for each position until the read start has moved
67  /// past the position.
68  Pileup(int window, const FUNC_CLASS& fp = FUNC_CLASS());
69 
70  /// Perform pileup with a reference.
71  Pileup(const std::string& refSeqFileName,
72  const FUNC_CLASS& fp = FUNC_CLASS());
73 
74  /// Perform pileup with a reference and a specified window size.
75  Pileup(int window, const std::string& refSeqFileName,
76  const FUNC_CLASS& fp = FUNC_CLASS());
77 
78  /// Destructor
79  virtual ~Pileup();
80 
81  /// Performs a pileup on the specified file.
82  /// \param excludeFlag if specified, if any bit set in the exclude flag
83  /// is set in the record's flag, it will be dropped.
84  /// Defaulted to exclude:
85  /// * unmapped,
86  /// * not primary alignment
87  /// * failed platform/vendor quality check
88  /// * PCR or optical duplicate
89  /// \param includeFlag if specified, every bit must be set in the
90  /// record's flag for it to be included -
91  /// defaulted to 0, no bits are required to be set.
92  /// \return 0 for success and non-zero for failure.
93  virtual int processFile(const std::string& fileName,
94  uint16_t excludeFlag = 0x0704,
95  uint16_t includeFlag = 0);
96 
97  /// Add an alignment to the pileup.
98  virtual void processAlignment(SamRecord& record);
99 
100  /// Add only positions that fall within the specified region of the
101  /// alignment to the pileup and outside of the specified excluded positions.
102  /// \param record alignment to be added to the pileup.
103  /// \param startPos 0-based start position of the bases that should be
104  /// added to the pileup.
105  /// \param endPos 0-based end position of the bases that should be added
106  /// to the pileup (this position is not added).
107  /// Set to -1 if there is no end position to the region.
108  /// \param excludeList list of refID/positions to exclude from processing.
109  virtual void processAlignmentRegion(SamRecord& record,
110  int startPos,
111  int endPos,
112  PosList* excludeList = NULL);
113 
114  /// Done processing, flush every position that is currently being stored
115  /// in the pileup.
116  void flushPileup();
117 
118 protected:
119  FUNC_CLASS myAnalyzeFuncPtr;
120 
121  // Always need the reference position.
122  void addAlignmentPosition(int refPosition, SamRecord& record);
123 
124 
125  virtual void flushPileup(int refID, int refPosition);
126  void flushPileup(int refPosition);
127 
128  // Get the position in the myElements container that is associated
129  // with the specified position. If the specified position cannot
130  // fit within the myElements container, -1 is returned.
131  int pileupPosition(int refPosition);
132 
133  virtual void resetElement(PILEUP_TYPE& element, int position);
134  virtual void addElement(PILEUP_TYPE& element, SamRecord& record);
135  virtual void analyzeElement(PILEUP_TYPE& element);
136  virtual void analyzeHead();
137 
138  std::vector<PILEUP_TYPE> myElements;
139 
140  int pileupStart;
141  int pileupHead;
142  int pileupTail; // last used position
143  int pileupWindow;
144 
145  int myCurrentRefID;
146 
147  GenomeSequence* myRefPtr;
148 };
149 
150 
151 template <class PILEUP_TYPE, class FUNC_CLASS>
153  : myAnalyzeFuncPtr(fp),
154  myElements(),
155  pileupStart(0),
156  pileupHead(0),
157  pileupTail(-1),
158  pileupWindow(PileupHelper::DEFAULT_WINDOW_SIZE),
159  myCurrentRefID(-2),
160  myRefPtr(NULL)
161 {
162  // Not using pointers since this is templated.
163  myElements.resize(pileupWindow);
164 }
165 
166 
167 template <class PILEUP_TYPE, class FUNC_CLASS>
168 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(int window, const FUNC_CLASS& fp)
169  : myAnalyzeFuncPtr(fp),
170  myElements(),
171  pileupStart(0),
172  pileupHead(0),
173  pileupTail(-1),
174  pileupWindow(window),
175  myCurrentRefID(-2),
176  myRefPtr(NULL)
177 {
178  // Not using pointers since this is templated.
179  myElements.resize(window);
180 }
181 
182 
183 template <class PILEUP_TYPE, class FUNC_CLASS>
184 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(const std::string& refSeqFileName, const FUNC_CLASS& fp)
185  : myAnalyzeFuncPtr(fp),
186  myElements(),
187  pileupStart(0),
188  pileupHead(0),
189  pileupTail(-1),
190  pileupWindow(PileupHelper::DEFAULT_WINDOW_SIZE),
191  myCurrentRefID(-2),
192  myRefPtr(NULL)
193 {
194  myRefPtr = new GenomeSequence(refSeqFileName.c_str());
195 
196  // Not using pointers since this is templated.
197  myElements.resize(pileupWindow);
198 
199  PILEUP_TYPE::setReference(myRefPtr);
200 }
201 
202 
203 template <class PILEUP_TYPE, class FUNC_CLASS>
204 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(int window, const std::string& refSeqFileName, const FUNC_CLASS& fp)
205  : myAnalyzeFuncPtr(fp),
206  myElements(),
207  pileupStart(0),
208  pileupHead(0),
209  pileupTail(-1),
210  pileupWindow(window),
211  myCurrentRefID(-2),
212  myRefPtr(NULL)
213 {
214  myRefPtr = new GenomeSequence(refSeqFileName.c_str());
215 
216  // Not using pointers since this is templated.
217  myElements.resize(window);
218 
219  PILEUP_TYPE::setReference(myRefPtr);
220 }
221 
222 
223 template <class PILEUP_TYPE, class FUNC_CLASS>
225 {
226  flushPileup();
227  if(myRefPtr != NULL)
228  {
229  delete myRefPtr;
230  myRefPtr = NULL;
231  }
232 }
233 
234 template <class PILEUP_TYPE, class FUNC_CLASS>
235 int Pileup<PILEUP_TYPE, FUNC_CLASS>::processFile(const std::string& fileName,
236  uint16_t excludeFlag,
237  uint16_t includeFlag)
238 {
239  SamFile samIn;
240  SamFileHeader header;
241  SamRecord record;
242 
243  if(myRefPtr != NULL)
244  {
245  samIn.SetReference(myRefPtr);
246  }
247 
248  if(!samIn.OpenForRead(fileName.c_str()))
249  {
250  fprintf(stderr, "%s\n", samIn.GetStatusMessage());
251  return(samIn.GetStatus());
252  }
253 
254  if(!samIn.ReadHeader(header))
255  {
256  fprintf(stderr, "%s\n", samIn.GetStatusMessage());
257  return(samIn.GetStatus());
258  }
259 
260  // The file needs to be sorted by coordinate.
262 
263  // Iterate over all records
264  while (samIn.ReadRecord(header, record))
265  {
266  uint16_t flag = record.getFlag();
267  if(flag & excludeFlag)
268  {
269  // This record has an excluded flag set,
270  // so continue to the next one.
271  continue;
272  }
273  if((flag & includeFlag) != includeFlag)
274  {
275  // This record does not have all required flags set,
276  // so continue to the next one.
277  continue;
278  }
279  processAlignment(record);
280  }
281 
282  flushPileup();
283 
284  int returnValue = 0;
285  if(samIn.GetStatus() != SamStatus::NO_MORE_RECS)
286  {
287  // Failed to read a record.
288  fprintf(stderr, "%s\n", samIn.GetStatusMessage());
289  returnValue = samIn.GetStatus();
290  }
291  return(returnValue);
292 }
293 
294 
295 template <class PILEUP_TYPE, class FUNC_CLASS>
297 {
298  int refPosition = record.get0BasedPosition();
299  int refID = record.getReferenceID();
300 
301  // Flush any elements from the pileup that are prior to this record
302  // since the file is sorted, we are done with those positions.
303  flushPileup(refID, refPosition);
304 
305  // Loop through for each reference position covered by the record.
306  // It is up to the PILEUP_TYPE to handle insertions/deletions, etc
307  // that are related with the given reference position.
308  for(; refPosition <= record.get0BasedAlignmentEnd(); ++refPosition)
309  {
310  addAlignmentPosition(refPosition, record);
311  }
312 }
313 
314 
315 template <class PILEUP_TYPE, class FUNC_CLASS>
317  int startPos,
318  int endPos,
319  PosList* excludeList)
320 {
321  int refPosition = record.get0BasedPosition();
322  int refID = record.getReferenceID();
323 
324  // Flush any elements from the pileup that are prior to this record
325  // since the file is sorted, we are done with those positions.
326  flushPileup(refID, refPosition);
327 
328  // Check if the region starts after this reference starts. If so,
329  // we only want to start adding at the region start position.
330  if(startPos > refPosition)
331  {
332  refPosition = startPos;
333  }
334 
335  // Loop through for each reference position covered by the record.
336  // It is up to the PILEUP_TYPE to handle insertions/deletions, etc
337  // that are related with the given reference position.
338  for(; refPosition <= record.get0BasedAlignmentEnd(); ++refPosition)
339  {
340  // Check to see if we have gone past the end of the region, in which
341  // case we can stop processing this record. Check >= since the
342  // end position is not in the region.
343  if((endPos != -1) && (refPosition >= endPos))
344  {
345  break;
346  }
347 
348  // Check to see if this position is in the exclude list.
349  bool addPos = true;
350  if(excludeList != NULL)
351  {
352  // There is an exclude list, so lookup the position.
353  if(excludeList->hasPosition(refID, refPosition))
354  {
355  // This position is in the exclude list, so don't add it.
356  addPos = false;
357  }
358  }
359  if(addPos)
360  {
361  addAlignmentPosition(refPosition, record);
362  }
363  }
364 }
365 
366 
367 template <class PILEUP_TYPE, class FUNC_CLASS>
369 {
370  // while there are still entries between the head and tail, flush,
371  // but no need to flush if pileupTail == -1 because in that case
372  // no entries have been added
373  while ((pileupHead <= pileupTail) && (pileupTail != -1))
374  {
375  flushPileup(pileupHead+1);
376  }
377  pileupStart = pileupHead = 0;
378  pileupTail = -1;
379 }
380 
381 
382 // Always need the reference position.
383 template <class PILEUP_TYPE, class FUNC_CLASS>
385  SamRecord& record)
386 {
387  int offset = 0;
388  try{
389  offset = pileupPosition(refPosition);
390  }
391  catch(std::runtime_error& err)
392  {
393  const char* overflowErr = "Overflow on the pileup buffer:";
394  String errorMessage = err.what();
395  if(strncmp(err.what(), overflowErr, strlen(overflowErr)) == 0)
396  {
397 
398  errorMessage += "\n\tPileup Buffer Overflow: recordName = ";
399  errorMessage += record.getReadName();
400  errorMessage += "; Cigar = ";
401  errorMessage += record.getCigar();
402  }
403  throw std::runtime_error(errorMessage.c_str());
404  }
405 
406  if((offset < 0) || (offset >= pileupWindow))
407  {
408  std::cerr << "Pileup Buffer Overflow: position = " << refPosition
409  << "; refID = " << record.getReferenceID()
410  << "; recStartPos = " << record.get1BasedPosition()
411  << "; pileupStart = " << pileupStart
412  << "; pileupHead = " << pileupHead
413  << "; pileupTail = " << pileupTail;
414  }
415 
416  addElement(myElements[offset], record);
417 }
418 
419 
420 template <class PILEUP_TYPE, class FUNC_CLASS>
421 void Pileup<PILEUP_TYPE, FUNC_CLASS>::flushPileup(int refID, int position)
422 {
423  // if new chromosome, flush the entire pileup.
424  if(refID != myCurrentRefID)
425  {
426  // New chromosome, flush everything.
427  flushPileup();
428  myCurrentRefID = refID;
429  }
430  else
431  {
432  // on the same chromosome, so flush just up to this new position.
433  flushPileup(position);
434  }
435 }
436 
437 
438 template <class PILEUP_TYPE, class FUNC_CLASS>
440 {
441  // Flush up to this new position, but no reason to flush if
442  // pileupHead has not been set.
443  while((pileupHead < position) && (pileupHead <= pileupTail))
444  {
445  analyzeHead();
446 
447  pileupHead++;
448 
449  if(pileupHead - pileupStart >= pileupWindow)
450  pileupStart += pileupWindow;
451  }
452 
453  if(pileupHead > pileupTail)
454  {
455  // All positions have been flushed, so reset pileup info
456  pileupHead = pileupStart = 0;
457  pileupTail = -1;
458  }
459 }
460 
461 
462 // Get the position in the myElements container that is associated
463 // with the specified position. If the specified position cannot
464 // fit within the myElements container, -1 is returned.
465 template <class PILEUP_TYPE, class FUNC_CLASS>
467 {
468  // Check to see if this is the first position (pileupTail == -1)
469  if(pileupTail == -1)
470  {
471  pileupStart = pileupHead = position;
472  // This is the first time this position is being used, so
473  // reset the element.
474  resetElement(myElements[0], position);
475  pileupTail = position;
476  return(0);
477  }
478 
479 
480  if((position < pileupHead) || (position > (pileupHead + pileupWindow)))
481  {
482  String errorMessage =
483  "Overflow on the pileup buffer: specifiedPosition = ";
484  errorMessage += position;
485  errorMessage += ", pileup buffer start position: ";
486  errorMessage += pileupHead;
487  errorMessage += ", pileup buffer end position: ";
488  errorMessage += pileupHead + pileupWindow;
489 
490  throw std::runtime_error(errorMessage.c_str());
491  }
492 
493  // int offset = position - pileupStart;
494  int offset = position - pileupStart;
495 
496  if(offset >= pileupWindow)
497  {
498  offset -= pileupWindow;
499  }
500 
501  // Check to see if position is past the end of the currently
502  // setup pileup positions.
503  while(position > pileupTail)
504  {
505  // Increment pileupTail to the next position since the current
506  // pileupTail is already in use.
507  ++pileupTail;
508 
509  // Figure out the offset for this next position.
510  offset = pileupTail - pileupStart;
511  if(offset >= pileupWindow)
512  {
513  offset -= pileupWindow;
514  }
515 
516  // This is the first time this position is being used, so
517  // reset the element.
518  resetElement(myElements[offset], pileupTail);
519  }
520 
521  return(offset);
522 }
523 
524 
525 template <class PILEUP_TYPE, class FUNC_CLASS>
526 void Pileup<PILEUP_TYPE, FUNC_CLASS>::resetElement(PILEUP_TYPE& element,
527  int position)
528 {
529  element.reset(position);
530 }
531 
532 
533 template <class PILEUP_TYPE, class FUNC_CLASS>
534 void Pileup<PILEUP_TYPE, FUNC_CLASS>::addElement(PILEUP_TYPE& element,
535  SamRecord& record)
536 {
537  element.addEntry(record);
538 }
539 
540 
541 template <class PILEUP_TYPE, class FUNC_CLASS>
542 void Pileup<PILEUP_TYPE, FUNC_CLASS>::analyzeElement(PILEUP_TYPE& element)
543 {
544  myAnalyzeFuncPtr(element);
545 }
546 
547 
548 template <class PILEUP_TYPE, class FUNC_CLASS>
550 {
551  myAnalyzeFuncPtr(myElements[pileupHead - pileupStart]);
552 }
553 
554 
555 #endif
SamFile::COORDINATE
@ COORDINATE
file is sorted by coordinate.
Definition: SamFile.h:49
SamRecord::get0BasedPosition
int32_t get0BasedPosition()
Get the 0-based(BAM) leftmost position of the record.
Definition: SamRecord.cpp:1307
SamRecord::get0BasedAlignmentEnd
int32_t get0BasedAlignmentEnd()
Returns the 0-based inclusive rightmost position of the clipped sequence.
Definition: SamRecord.cpp:1455
SamRecord::getReferenceID
int32_t getReferenceID()
Get the reference sequence id of the record (BAM format rid).
Definition: SamRecord.cpp:1293
String
Definition: StringBasics.h:39
SamFile::GetStatusMessage
const char * GetStatusMessage()
Get the Status Message of the last call that sets status.
Definition: SamFile.h:213
defaultPileup
Definition: Pileup.h:34
StatGenStatus::NO_MORE_RECS
@ NO_MORE_RECS
NO_MORE_RECS: failed to read a record since there are no more to read either in the file or section i...
Definition: StatGenStatus.h:36
SamFile::GetStatus
SamStatus::Status GetStatus()
Get the Status of the last call that sets status.
Definition: SamFile.h:207
GenomeSequence
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
Definition: GenomeSequence.h:100
PosList
Store refID/position, but does not store values < 0.
Definition: PosList.h:25
SamFile::ReadRecord
bool ReadRecord(SamFileHeader &header, SamRecord &record)
Reads the next record from the file & stores it in the passed in record.
Definition: SamFile.cpp:501
Pileup::Pileup
Pileup(const std::string &refSeqFileName, const FUNC_CLASS &fp=FUNC_CLASS())
Perform pileup with a reference.
Definition: Pileup.h:184
Pileup::flushPileup
void flushPileup()
Done processing, flush every position that is currently being stored in the pileup.
Definition: Pileup.h:368
SamRecord::getFlag
uint16_t getFlag()
Get the flag (FLAG).
Definition: SamRecord.cpp:1372
Pileup::processAlignmentRegion
virtual void processAlignmentRegion(SamRecord &record, int startPos, int endPos, PosList *excludeList=NULL)
Add only positions that fall within the specified region of the alignment to the pileup and outside o...
Definition: Pileup.h:316
Pileup::~Pileup
virtual ~Pileup()
Destructor.
Definition: Pileup.h:224
SamRecord::get1BasedPosition
int32_t get1BasedPosition()
Get the 1-based(SAM) leftmost position (POS) of the record.
Definition: SamRecord.cpp:1300
Pileup::Pileup
Pileup(const FUNC_CLASS &fp=FUNC_CLASS())
Constructor using the default maximum number of bases a read spans.
Definition: Pileup.h:152
Pileup::Pileup
Pileup(int window, const std::string &refSeqFileName, const FUNC_CLASS &fp=FUNC_CLASS())
Perform pileup with a reference and a specified window size.
Definition: Pileup.h:204
SamRecord::getReadName
const char * getReadName()
Returns the SAM formatted Read Name (QNAME).
Definition: SamRecord.cpp:1530
PileupHelper
Definition: Pileup.h:27
Pileup::Pileup
Pileup(int window, const FUNC_CLASS &fp=FUNC_CLASS())
Constructor that sets the maximum number of bases a read spans.
Definition: Pileup.h:168
SamFileHeader
This class allows a user to get/set the fields in a SAM/BAM Header.
Definition: SamFileHeader.h:35
SamRecord::getCigar
const char * getCigar()
Returns the SAM formatted CIGAR string.
Definition: SamRecord.cpp:1543
SamFile::setSortedValidation
void setSortedValidation(SortedType sortType)
Set the flag to validate that the file is sorted as it is read/written.
Definition: SamFile.cpp:669
SamRecord
Class providing an easy to use interface to get/set/operate on the fields in a SAM/BAM record.
Definition: SamRecord.h:52
SamFile::OpenForRead
bool OpenForRead(const char *filename, SamFileHeader *header=NULL)
Open a sam/bam file for reading with the specified filename, determing the type of file and SAM/BAM b...
Definition: SamFile.cpp:93
Pileup::processFile
virtual int processFile(const std::string &fileName, uint16_t excludeFlag=0x0704, uint16_t includeFlag=0)
Performs a pileup on the specified file.
Definition: Pileup.h:235
SamFile::ReadHeader
bool ReadHeader(SamFileHeader &header)
Reads the header section from the file and stores it in the passed in header.
Definition: SamFile.cpp:437
SamFile
Allows the user to easily read/write a SAM/BAM file.
Definition: SamFile.h:36
Pileup::processAlignment
virtual void processAlignment(SamRecord &record)
Add an alignment to the pileup.
Definition: Pileup.h:296
SamFile::SetReference
void SetReference(GenomeSequence *reference)
Sets the reference to the specified genome sequence object.
Definition: SamFile.cpp:380
Pileup
Class to perform a pileup of all reads by position, assuming the reads are coordinate sorted.
Definition: Pileup.h:59
PosList::hasPosition
bool hasPosition(int refID, int refPosition)
Return whether or not this list contains the specified reference ID and position (negative values wil...
Definition: PosList.cpp:81