libStatGen Software  1
SamFileHeader.cpp
1 /*
2  * Copyright (C) 2010 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include "SamFileHeader.h"
19 #include "SamHeaderSQ.h"
20 #include "SamHeaderRG.h"
21 
22 
23 const std::string SamFileHeader::EMPTY_RETURN = "";
24 
25 SamFileHeader::SamFileHeader()
26  : myHD(NULL),
27  myReferenceInfo(),
28  myErrorMessage("")
29 {
30  resetHeader();
31 
32  mySQs.setCaseSensitive(true);
33  myRGs.setCaseSensitive(true);
34  myPGs.setCaseSensitive(true);
35 }
36 
37 
38 SamFileHeader::~SamFileHeader()
39 {
40  resetHeader();
41 }
42 
43 
44 // Copy Constructor
45 SamFileHeader::SamFileHeader(const SamFileHeader& header)
46 {
47  copy(header);
48 }
49 
50 
51 // Overload operator = to copy the passed in header into this header.
53 {
54  copy(header);
55  return(*this);
56 }
57 
58 
60 {
61  // Check to see if the passed in value is the same as this.
62  if(this == &header)
63  {
64  return(true);
65  }
66 
67  resetHeader();
68 
69  // Copy the records by getting the other header's header string
70  // and parsing it.
71  std::string newString;
72  bool status = header.getHeaderString(newString);
73  String newHeaderString = newString.c_str();
74 
75  status &= parseHeader(newHeaderString);
76 
77  myCurrentHeaderIndex = header.myCurrentHeaderIndex;
78  myCurrentCommentIndex = header.myCurrentCommentIndex;
79 
80  // Clear the reference info and copy it to ensure it is the same.
81  myReferenceInfo.clear();
82  // Copy Reference contigs, hash, lengths.
83  myReferenceInfo = header.myReferenceInfo;
84 
85  return(status);
86 }
87 
88 
89 // Reset the header for a new entry, clearing out previous values.
91 {
92  myReferenceInfo.clear();
93 
94  // Clear the pointers to the header records. They are deleted when the
95  // vector is cleaned up.
96  myHD = NULL;
97  mySQs.Clear();
98  myRGs.Clear();
99  myPGs.Clear();
100 
101  // Delete the header records and clear the vector.
102  for(unsigned int headerIndex = 0; headerIndex < myHeaderRecords.size();
103  headerIndex++)
104  {
105  if(myHeaderRecords[headerIndex] != NULL)
106  {
107  delete myHeaderRecords[headerIndex];
108  myHeaderRecords[headerIndex] = NULL;
109  }
110  }
111  myHeaderRecords.clear();
112 
113  // Reset the iterator for the header lines.
115 
116  // Reset the comment iterator.
118 
119  // Reset the individual type header iterators.
123 
124  // Clear the comments
125  myComments.clear();
126 }
127 
128 
129 // Set the passed in string to the entire header string. Clearing its
130 // current contents.
131 bool SamFileHeader::getHeaderString(std::string& header) const
132 {
133  header.clear();
134 
135  // Keep getting header lines until there are no more - false returned.
136  unsigned int index = 0;
137  while(getHeaderLine(index, header) != false)
138  {
139  ++index;
140  }
141 
142  return(true);
143 }
144 
145 
146 int SamFileHeader::getReferenceID(const String & referenceName, bool addID)
147 {
148  return(myReferenceInfo.getReferenceID(referenceName, addID));
149 }
150 
151 
152 int SamFileHeader::getReferenceID(const char* referenceName, bool addID)
153 {
154  return(myReferenceInfo.getReferenceID(referenceName, addID));
155 }
156 
157 
159 {
160  return(myReferenceInfo.getReferenceLabel(id));
161 }
162 
163 
164 // Get the Reference Information
166 {
167  return(myReferenceInfo);
168 }
169 
170 
171 // Get the Reference Information for updating separately when reading
172 // BAMs...should only be called by BamInterface.
173 SamReferenceInfo& SamFileHeader::getReferenceInfoForBamInterface()
174 {
175  return(myReferenceInfo);
176 }
177 
178 
179 // Add a header line that has an const char* value.
180 bool SamFileHeader::addHeaderLine(const char* type, const char* tag,
181  const char* value)
182 {
183  String headerLine;
184  headerLine += "@";
185  headerLine += type;
186  headerLine += "\t";
187  headerLine += tag;
188  headerLine += ":";
189  headerLine += value;
190  return(addHeaderLine(headerLine.c_str()));
191 }
192 
193 
194 // Add a header line that is already preformatted in a const char*.
195 bool SamFileHeader::addHeaderLine(const char* headerLine)
196 {
197  // Parse the added header line.
198  String headerString = headerLine;
199  return(parseHeader(headerString));
200 }
201 
202 
203 // Add a header line that is already preformatted in a const char*.
204 bool SamFileHeader::addHeader(const char* header)
205 {
206  // Parse the added header line.
207  String headerString = header;
208  return(parseHeader(headerString));
209 }
210 
211 
212 // Add a comment.
213 bool SamFileHeader::addComment(const char* comment)
214 {
215  if((comment != NULL) && (strcmp(comment, EMPTY_RETURN.c_str()) != 0))
216  {
217  // Valid comment, so add it.
218  myComments.push_back(comment);
219  }
220  return(true);
221 }
222 
223 
224 // Add the specified tag and value to the HD header.
225 bool SamFileHeader::setHDTag(const char* tag, const char* value)
226 {
227  if(myHD == NULL)
228  {
229  // Need to create the HD line.
230  myHD = new SamHeaderHD();
231  if(myHD == NULL)
232  {
233  // New failed, return false.
234  myErrorMessage = "SamFileHeader: Failed to allocate a new HD tag";
235  return(false);
236  }
237  // Succeeded to create the line, add it to the
238  // list.
239  myHeaderRecords.push_back(myHD);
240  }
241  if(!myHD->setTag(tag, value))
242  {
243  myErrorMessage = "SamFileHeader: Failed to set the specified HD tag";
244  return(false);
245  }
246  return(true);
247 }
248 
249 
250 // Add the specified tag and value to the SQ header with the specified name.
251 // If the header does not yet exist, the header is added.
252 bool SamFileHeader::setSQTag(const char* tag, const char* value,
253  const char* name)
254 {
255  // Get the SQ record for the specified name.
256  SamHeaderSQ* sq = getSQ(name);
257  if(sq == NULL)
258  {
259  // The SQ does not yet exist.
260  // Make sure the tag is LN.
261  if(strcmp(tag, "LN") != 0)
262  {
263  // LN is required so must be the first tag added
264  myErrorMessage =
265  "SamFileHeader:Failed to add the specified SQ key, LN not specified.";
266  return(false);
267  }
268 
269  // Add it.
270  sq = new SamHeaderSQ();
271 
272  if(sq == NULL)
273  {
274  // Could not create the header record.
275  myErrorMessage = "SamFileHeader: Failed to allocate a new SQ tag";
276  return(false);
277  }
278 
279  // Created the header record, so add it to the list of SQ lines.
280  mySQs.Add(name, sq);
281  myHeaderRecords.push_back(sq);
282  // value is the length, so update the reference info.
283  myReferenceInfo.add(name, atoi(value));
284 
285  // Add the key tag
286  if(!sq->addKey(name))
287  {
288  // Failed to add the key tag, return false.
289  myErrorMessage = "SamFileHeader:Failed to add the specified SQ key";
290  return(false);
291  }
292  }
293  else if(strcmp(tag, "LN") == 0)
294  {
295  // Cannot modify/remove the LN tag.
296  myErrorMessage = "SamFileHeader:Cannot modify/remove the SQ's LN tag";
297  return(false);
298  }
299 
300  if(!sq->setTag(tag, value))
301  {
302  myErrorMessage = "Failed to set the specified SQ tag";
303  return(false);
304  }
305  return(true);
306 }
307 
308 
309 // Add the specified tag and value to the RG header with the read group
310 // identifier. If the header does not yet exist, the header is added.
311 bool SamFileHeader::setRGTag(const char* tag, const char* value, const char* id)
312 {
313  // Get the RG record for the specified name.
314  SamHeaderRG* rg = getRG(id);
315  if(rg == NULL)
316  {
317  // The RG does not yet exist.
318  // Add it.
319  rg = new SamHeaderRG();
320 
321  if(rg == NULL)
322  {
323  // Could not create the header record.
324  myErrorMessage = "Failed to allocate a new RG tag";
325  return(false);
326  }
327 
328  // Created the header record, so add it to the list of RG lines.
329  myRGs.Add(id, rg);
330  myHeaderRecords.push_back(rg);
331 
332  // Add the key tag
333  if(!rg->addKey(id))
334  {
335  // Failed to add the key tag, return false.
336  myErrorMessage = "Failed to add the specified RG key";
337  return(false);
338  }
339  }
340 
341  if(!rg->setTag(tag, value))
342  {
343  myErrorMessage = "Failed to set the specified RG tag";
344  return(false);
345  }
346  return(true);
347 }
348 
349 
350 // Add the specified tag and value to the PG header with the specified id.
351 // If the header does not yet exist, the header is added.
352 // Add the specified tag and value to the PG header.
353 bool SamFileHeader::setPGTag(const char* tag, const char* value, const char* id)
354 {
355  // Get the PG record for the specified name.
356  SamHeaderPG* pg = getPG(id);
357  if(pg == NULL)
358  {
359  // The PG does not yet exist.
360  // Add it.
361  pg = new SamHeaderPG();
362 
363  if(pg == NULL)
364  {
365  // Could not create the header record.
366  myErrorMessage = "Failed to allocate a new PG tag";
367  return(false);
368  }
369 
370  // Created the header record, so add it to the list of PG lines.
371  myPGs.Add(id, pg);
372  myHeaderRecords.push_back(pg);
373 
374  // Add the key tag
375  if(!pg->addKey(id))
376  {
377  // Failed to add the key tag, return false.
378  myErrorMessage = "Failed to add the specified PG key";
379  return(false);
380  }
381  }
382 
383  if(!pg->setTag(tag, value))
384  {
385  myErrorMessage = "Failed to set the specified PG tag";
386  return(false);
387  }
388  return(true);
389 }
390 
391 
392 // Add the HD record to the header.
394 {
395  // If there is already an HD header or if null
396  // was passed in, return false.
397  if(myHD != NULL)
398  {
399  myErrorMessage = "Failed add an HD tag - there is already one";
400  return(false);
401  }
402  if(hd == NULL)
403  {
404  myErrorMessage = "Failed add an HD tag - no tag specified";
405  return(false);
406  }
407  myHD = hd;
408 
409  myHeaderRecords.push_back(myHD);
410  return(true);
411 }
412 
413 
414 // Add the SQ record to the header.
416 {
417  if(sq == NULL)
418  {
419  // null pointer passed in, can't add it.
420  myErrorMessage = "SAM/BAM Header line failed to allocate SQ.";
421  return(false);
422  }
423  const char* name = sq->getTagValue("SN");
424  const char* length = sq->getTagValue("LN");
425  if(strcmp(name, EMPTY_RETURN.c_str()) == 0)
426  {
427  // SN is not set, so can't add it.
428  myErrorMessage =
429  "SAM/BAM Header line failure: Skipping SQ line that is missing the SN field.";
430  return(false);
431  }
432  if(strcmp(length, EMPTY_RETURN.c_str()) == 0)
433  {
434  // LN is not set, so can't add it.
435  myErrorMessage =
436  "SAM/BAM Header line failure: Skipping SQ line that is missing the LN field.";
437  return(false);
438  }
439 
440  // Determine whether or not a record with this
441  // key is already in the hash.
442  if(mySQs.Find(name) < 0)
443  {
444  // It is not already in the hash so add it.
445  mySQs.Add(name, sq);
446  myHeaderRecords.push_back(sq);
447  myReferenceInfo.add(name, atoi(length));
448  return(true);
449  }
450 
451  // It is already in the hash, so cannot be added.
452  myErrorMessage = "SAM/BAM Header line failure: Skipping SQ line that has a repeated SN field.";
453  return(false);
454 }
455 
456 
457 // Add the RG record to the header.
459 {
460  if(rg == NULL)
461  {
462  // null pointer passed in, can't add it.
463  myErrorMessage = "SAM/BAM Header line failed to allocate RG.";
464  return(false);
465  }
466  const char* id = rg->getTagValue("ID");
467  if(strcmp(id, EMPTY_RETURN.c_str()) == 0)
468  {
469  // ID is not set, so can't add it.
470  myErrorMessage = "SAM/BAM Header line failure: Skipping RG line that is missing the ID field.";
471  return(false);
472  }
473 
474  // Determine whether or not a record with this
475  // key is already in the hash.
476  if(myRGs.Find(id) < 0)
477  {
478  // It is not already in the hash so
479  // add it.
480  myRGs.Add(id, rg);
481  myHeaderRecords.push_back(rg);
482  return(true);
483  }
484 
485  // It is already in the hash, so cannot be added.
486  myErrorMessage = "SAM/BAM Header line failure: Skipping RG line that has a repeated ID field.";
487  return(false);
488 }
489 
490 
491 // Add the PG record to the header.
493 {
494  // If a null pointer was passed in, return false.
495  if(pg == NULL)
496  {
497  myErrorMessage = "SAM/BAM Header line failed to allocate PG.";
498  return(false);
499  }
500  const char* id = pg->getTagValue("ID");
501  if(strcmp(id, EMPTY_RETURN.c_str()) == 0)
502  {
503  // ID is not set, so can't add the header record.
504  myErrorMessage = "SAM/BAM Header line failure: Skipping PG line that is missing the ID field.";
505  return(false);
506  }
507 
508  // Determine whether or not a record with this
509  // key is already in the hash.
510  if(myPGs.Find(id) < 0)
511  {
512  // It is not already in the hash so
513  // add it.
514  myPGs.Add(id, pg);
515  myHeaderRecords.push_back(pg);
516  return(true);
517  }
518 
519  // It is already in the hash, so cannot be added.
520  myErrorMessage = "SAM/BAM Header line failure: Skipping PG line that has a repeated ID field.";
521  return(false);
522 }
523 
524 
525 // Add a copy of the specified header record (HD, PG, RG, or SQ) to the header.
527 {
528  SamHeaderRecord* newRec = hdrRec.createCopy();
529  bool returnVal = true;
530  switch(newRec->getType())
531  {
532  case SamHeaderRecord::HD:
533  returnVal = addHD((SamHeaderHD*)newRec);
534  break;
535  case SamHeaderRecord::PG:
536  returnVal = addPG((SamHeaderPG*)newRec);
537  break;
538  case SamHeaderRecord::RG:
539  returnVal = addRG((SamHeaderRG*)newRec);
540  break;
541  case SamHeaderRecord::SQ:
542  returnVal = addSQ((SamHeaderSQ*)newRec);
543  break;
544  default:
545  myErrorMessage = "Failed to copy a header record, unknown type.";
546  returnVal = false;
547  break;
548  }
549  return(returnVal);
550 }
551 
552 
553 // Remove the HD record.
555 {
556  if(myHD != NULL)
557  {
558  // Reset the record. Do not delete it since it is in the headerRecords
559  // vector and it is not worth the time to remove it from the middle of
560  // that vector since this is the header and the space does not need
561  // to be conserved.
562  myHD->reset();
563 
564  // Set myHD to null so a new HD could be added.
565  myHD = NULL;
566  }
567 
568  return(true);
569 }
570 
571 
572 // Remove the SQ record associated with the specified name.
573 bool SamFileHeader::removeSQ(const char* name)
574 {
575  // Look up the name in the hash.
576  int hashIndex = mySQs.Find(name);
577  if(hashIndex < 0)
578  {
579  // Not found in the hash, so nothing to
580  // delete, return true it does not exist
581  // in the hash.
582  return(true);
583  }
584 
585  // Get the SQ.
586  SamHeaderSQ* sq = (SamHeaderSQ*)(mySQs.Object(hashIndex));
587 
588  if(sq == NULL)
589  {
590  // sq is null, this is an error since hashIndex was greater than 0,
591  // so it should have been found.
592  myErrorMessage = "SAM/BAM Header line failed to get SQ object.";
593  return(false);
594  }
595 
596  // Reset the record. Do not delete it since it is in the headerRecords
597  // vector and it is not worth the time to remove it from the middle of
598  // that vector since this is the header and the space does not need
599  // to be conserved.
600  sq->reset();
601 
602  // Delete the entry from the hash.
603  mySQs.Delete(hashIndex);
604 
605  return(true);
606 }
607 
608 
609 // Remove the RG record associated with the specified id.
610 bool SamFileHeader::removeRG(const char* id)
611 {
612  // Look up the id in the hash.
613  int hashIndex = myRGs.Find(id);
614  if(hashIndex < 0)
615  {
616  // Not found in the hash, so nothing to
617  // delete, return true it does not exist
618  // in the hash.
619  return(true);
620  }
621 
622  // Get the RG.
623  SamHeaderRG* rg = (SamHeaderRG*)(myRGs.Object(hashIndex));
624 
625  if(rg == NULL)
626  {
627  // rg is null, this is an error since hashIndex was greater than 0,
628  // so it should have been found.
629  myErrorMessage = "SAM/BAM Header line failed to get RG object.";
630  return(false);
631  }
632 
633  // Reset the record. Do not delete it since it is in the headerRecords
634  // vector and it is not worth the time to remove it from the middle of
635  // that vector since this is the header and the space does not need
636  // to be conserved.
637  rg->reset();
638 
639  // Delete the entry from the hash.
640  myRGs.Delete(hashIndex);
641 
642  return(true);
643 }
644 
645 
646 // Remove the PG record associated with the specified id.
647 bool SamFileHeader::removePG(const char* id)
648 {
649  // Look up the id in the hash.
650  int hashIndex = myPGs.Find(id);
651  if(hashIndex < 0)
652  {
653  // Not found in the hash, so nothing to
654  // delete, return true it does not exist
655  // in the hash.
656  return(true);
657  }
658 
659  // Get the PG.
660  SamHeaderPG* pg = (SamHeaderPG*)(myPGs.Object(hashIndex));
661 
662  if(pg == NULL)
663  {
664  // pg is null, this is an error since hashIndex was greater than 0,
665  // so it should have been found.
666  myErrorMessage = "SAM/BAM Header line failed to get PG object.";
667  return(false);
668  }
669 
670  // Reset the record. Do not delete it since it is in the headerRecords
671  // vector and it is not worth the time to remove it from the middle of
672  // that vector since this is the header and the space does not need
673  // to be conserved.
674  pg->reset();
675 
676  // Delete the entry from the hash.
677  myPGs.Delete(hashIndex);
678 
679  return(true);
680 }
681 
682 
683 const char* SamFileHeader::getHDTagValue(const char* tag)
684 {
685  if(myHD == NULL)
686  {
687  // return blank since there is no HD type.
688  return(EMPTY_RETURN.c_str());
689  }
690  return(myHD->getTagValue(tag));
691 }
692 
693 
694 // Get the value associated with the specified tag on the SQ line with
695 // the specified sequence name.
696 const char* SamFileHeader::getSQTagValue(const char* tag, const char* name)
697 {
698  // Look up the name in the hash to get the associated SQ object.
699  SamHeaderSQ* sq = (SamHeaderSQ*)(mySQs.Object(name));
700 
701  // If it is NULL - the tag was not found, so return
702  if(sq == NULL)
703  {
704  return(EMPTY_RETURN.c_str());
705  }
706 
707  // Found the object, so return the SQ Tag.
708  return(sq->getTagValue(tag));
709 }
710 
711 
712 // Get the value associated with the specified tag on the RG line with
713 // the specified read group identifier.
714 const char* SamFileHeader::getRGTagValue(const char* tag, const char* id)
715 {
716  // Look up the id in the hash to get the associated RG object.
717  SamHeaderRG* rg = (SamHeaderRG*)(myRGs.Object(id));
718 
719  // If it is NULL - the tag was not found, so return
720  if(rg == NULL)
721  {
722  return(EMPTY_RETURN.c_str());
723  }
724 
725  // Found the object, so return the RG Tag.
726  return(rg->getTagValue(tag));
727 }
728 
729 
730 const char* SamFileHeader::getPGTagValue(const char* tag, const char* id)
731 {
732  // Look up the id in the hash to get the associated PG object.
733  SamHeaderPG* pg = (SamHeaderPG*)(myPGs.Object(id));
734 
735  // If it is NULL - the tag was not found, so return
736  if(pg == NULL)
737  {
738  return(EMPTY_RETURN.c_str());
739  }
740 
741  // Found the object, so return the PG Tag.
742  return(pg->getTagValue(tag));
743 }
744 
745 
746 // Get the number of SQ objects.
748 {
749  return(mySQs.Entries());
750 }
751 
752 
753 // Get the number of RG objects.
755 {
756  return(myRGs.Entries());
757 }
758 
759 
760 // Get the number of PG objects.
762 {
763  return(myPGs.Entries());
764 }
765 
766 
767 // Get the HD object.
769 {
770  return(myHD);
771 }
772 
773 
774 // Get the SQ object with the specified sequence name.
776 {
777  return((SamHeaderSQ*)(mySQs.Object(name)));
778 }
779 
780 
781 // Get the RG object with the specified read group identifier.
783 {
784  return((SamHeaderRG*)(myRGs.Object(id)));
785 }
786 
787 
788 // Get the PG object.
790 {
791  return((SamHeaderPG*)(myPGs.Object(id)));
792 }
793 
794 
795 // Return the value of the SO tag.
796 // If this field does not exist, EMPTY_RETURN.c_str() is returned.
798 {
799  if(myHD == NULL)
800  {
801  // No HD, so return blank EMPTY_RETURN.c_str()
802  return(EMPTY_RETURN.c_str());
803  }
804  return(myHD->getSortOrder());
805 }
806 
807 
808 // Deprecated way of getting the sort order from the file.
810 {
811  return(getSortOrder());
812 }
813 
814 
815 // Get the next SQ header record. After all SQ headers have been retrieved,
816 // NULL is returned until a reset is called.
818 {
819  return(getNextHeaderRecord(myCurrentSQIndex,
821 }
822 
823 
824 // Get the next RG header record. After all RG headers have been retrieved,
825 // NULL is returned until a reset is called.
827 {
828  return(getNextHeaderRecord(myCurrentRGIndex,
830 }
831 
832 
833 // Get the next PG header record. After all PG headers have been retrieved,
834 // NULL is returned until a reset is called.
836 {
837  return(getNextHeaderRecord(myCurrentPGIndex,
839 }
840 
841 
842 // Reset to the beginning of the header records so the next call
843 // to getNextSQRecord returns the first SQ header record.
845 {
846  myCurrentSQIndex = 0;
847 }
848 
849 
850 // Reset to the beginning of the header records so the next call
851 // to getNextRGRecord returns the first RG header record.
853 {
854  myCurrentRGIndex = 0;
855 }
856 
857 
858 // Reset to the beginning of the header records so the next call
859 // to getNextPGRecord returns the first PG header record.
861 {
862  myCurrentPGIndex = 0;
863 }
864 
865 
866 // Get the next header record of the specified type.
867 // Pass in the index to start looking at and the type to look for.
868 // Update the index.
869 // After all headers of that type have been retrieved,
870 // NULL is returned until a reset is called for that type.
873 {
874  SamHeaderRecord* foundRecord = NULL;
875  // Loop until a record is found or until out of range of the
876  // headerRecord vector.
877  while((index < myHeaderRecords.size())
878  && (foundRecord == NULL))
879  {
880  // Get the next record.
881  foundRecord = myHeaderRecords[index];
882  // Either way, increment the index.
883  ++index;
884  // Check to see if the next record is active.
885  if(!foundRecord->isActiveHeaderRecord())
886  {
887  // Not active, so clear the pointer.
888  foundRecord = NULL;
889  }
890  // Check to see if the record is the right type.
891  else if(foundRecord->getType() != headerType)
892  {
893  // Not the right type, so clear the pointer.
894  foundRecord = NULL;
895  }
896  }
897 
898  // Return the record if it was found. Will be null if none were found.
899  return(foundRecord);
900 }
901 
902 
903 // Get the next header record. After all headers have been retrieved,
904 // NULL is returned until a reset is called. Does not return the
905 // Comment lines.
906 // NOTE: both getNextHeaderRecord and getNextHeaderLine increment the
907 // same iterator.
909 {
910  // Get the next header record
911  SamHeaderRecord* foundRecord = NULL;
912  // Loop until a record is found or until out of range of the
913  // headerRecord vector.
914  while((myCurrentHeaderIndex < myHeaderRecords.size())
915  && (foundRecord == NULL))
916  {
917  // Get the next record.
918  foundRecord = myHeaderRecords[myCurrentHeaderIndex];
919  // Either way, increment the index.
920  ++myCurrentHeaderIndex;
921  // Check to see if the next record is active.
922  if(!foundRecord->isActiveHeaderRecord())
923  {
924  // Not active, so clear the pointer.
925  foundRecord = NULL;
926  }
927  }
928 
929  // Return the record if it was found. Will be null if none were found.
930  return(foundRecord);
931 }
932 
933 
934 // Set the passed in string to the next header line. The passed in
935 // string will be overwritten. If there are no more header lines or there
936 // is an error, false is returned and the passed in string is set to EMPTY_RETURN.c_str()
937 // until a rest is called.
938 // Will also return the comment lines.
939 // NOTE: both getNextHeaderRecord and getNextHeaderLine increment the
940 // same iterator.
941 bool SamFileHeader::getNextHeaderLine(std::string &headerLine)
942 {
943  headerLine = EMPTY_RETURN.c_str();
944 
945  // Until the header is set, keep reading.
946  // Header could return EMPTY_RETURN.c_str() if the header line is blank.
947  while(headerLine == EMPTY_RETURN.c_str())
948  {
949  if(getHeaderLine(myCurrentHeaderIndex, headerLine) == false)
950  {
951  // getHeaderLine failed, so stop processing, and return false.
952  return(false);
953  }
954  else
955  {
956  // In range, increment the index.
957  ++myCurrentHeaderIndex;
958  }
959  }
960  return(true);
961 }
962 
963 
964 // Reset to the beginning of the header records so the next call
965 // to getNextHeaderRecord returns the first header line.
967 {
968  myCurrentHeaderIndex = 0;
969 }
970 
971 
972 void SamFileHeader::appendCommentLines(std::string &commentLines)
973 {
974  for(unsigned int i = 0; i < myComments.size(); i++)
975  {
976  commentLines += "@CO\t";;
977  commentLines += myComments[i];
978  commentLines += "\n";
979  }
980 }
981 
982 
983 // Returns the comment on the next comment line. Returns EMPTY_RETURN.c_str() if all comment
984 // lines have been returned, until resetCommentIter is called.
986 {
987  if(myCurrentCommentIndex < myComments.size())
988  {
989  return(myComments[myCurrentCommentIndex++].c_str());
990  }
991  // Already gone through all the comments, return EMPTY_RETURN.c_str().
992  return(EMPTY_RETURN.c_str());
993 }
994 
995 
996 // Resets to the beginning of the comments so getNextComment returns
997 // the first comment.
999 {
1000  myCurrentCommentIndex = 0;
1001 }
1002 
1003 
1004 // Parse the header.
1005 bool SamFileHeader::parseHeader(String& header)
1006 {
1007  std::string errorMessage = "";
1008  int numErrors = 0;
1009  int numValid = 0;
1010 
1011  // Split the header into lines.
1012  std::vector<String>* types = header.Split('\n');
1013 
1014  // Loop through each header line, parsing that line.
1015  for(uint32_t index = 0; index < types->size(); index++)
1016  {
1017  // Parse the header line.
1018  if(!parseHeaderLine(types->at(index)))
1019  {
1020  errorMessage += myErrorMessage;
1021  errorMessage += "\n";
1022  ++numErrors;
1023  }
1024  else
1025  {
1026  // valid header line
1027  ++numValid;
1028  }
1029  }
1030 
1031  // Delete the types vector.
1032  delete types;
1033  types = NULL;
1034 
1035  myErrorMessage = errorMessage;
1036  if((numErrors > 0) && (numValid == 0))
1037  {
1038  // Only errors.
1039  std::cerr << numErrors
1040  << " invalid SAM/BAM Header lines were skipped due to:\n"
1041  << errorMessage << std::endl;
1042  return(false);
1043  }
1044  else if(numErrors > 0)
1045  {
1046  // Some valid & some invalid.
1047  // Going to return true, but add note about the invalid lines.
1048  std::cerr << numErrors
1049  << " invalid SAM/BAM Header lines were skipped due to:\n"
1050  << errorMessage << std::endl;
1051  }
1052 
1053  return(true);
1054 }
1055 
1056 
1057 // Parse one line of the header.
1058 bool SamFileHeader::parseHeaderLine(const String& headerLine)
1059 {
1060  // Check if the line starts with @CO.
1061  if((headerLine.Length() >= 4) && (headerLine[0] == '@') &&
1062  (headerLine[1] == 'C') && (headerLine[2] == 'O') &&
1063  (headerLine[3] == '\t'))
1064  {
1065  // Comment line.
1066  String comment = headerLine.SubStr(4);
1067  return(addComment(comment));
1068  }
1069 
1070  StringArray tokens;
1071 
1072  // Split the line by tabs.
1073  tokens.ReplaceColumns(headerLine, '\t');
1074 
1075  if(tokens.Length() < 1)
1076  {
1077  // Nothing on this line, just return true.
1078  return(true);
1079  }
1080 
1081  // Get the header type, the first column.
1082  if((tokens[0].Length() != 3) || (tokens[0][0] != '@'))
1083  {
1084  // The header type string is incorrect. Should be 3 characters
1085  // with the first one @.
1086  myErrorMessage = "SAM/BAM Header line does not start with @ & at least 2 chars.";
1087  return(false);
1088  }
1089 
1090  bool status = true;
1091  if(tokens[0] == "@HD")
1092  {
1093  if(myHD == NULL)
1094  {
1095  // Create a new hd.
1096  myHD = new SamHeaderHD();
1097  if(myHD == NULL)
1098  {
1099  // Failed to allocate HD, so return false.
1100  myErrorMessage = "SAM/BAM Header line failed to allocate HD.";
1101  return(false);
1102  }
1103  myHeaderRecords.push_back(myHD);
1104  if(!myHD->setFields(tokens))
1105  {
1106  myErrorMessage = "SAM/BAM Header line failed to store HD record.";
1107  status = false;
1108  }
1109  }
1110  else
1111  {
1112  // HD already set, so return false.
1113  myErrorMessage = "SAM/BAM Header line failure: multiple HD records.";
1114  status = false;
1115  }
1116  }
1117  else if(tokens[0] == "@SQ")
1118  {
1119  // Create a new SQ record.
1120  SamHeaderSQ* sq = new SamHeaderSQ();
1121 
1122  if(sq->setFields(tokens))
1123  {
1124  // sq fields were properly set, so add it to the list of
1125  // SQ lines.
1126  // myStatus set in the method.
1127  status &= addSQ(sq);
1128  }
1129  else
1130  {
1131  myErrorMessage = "SAM/BAM Header line failed to store SQ record.";
1132  status = false;
1133  }
1134  }
1135  else if(tokens[0] == "@RG")
1136  {
1137  // Create a new RG record.
1138  SamHeaderRG* rg = new SamHeaderRG();
1139 
1140  if(rg->setFields(tokens))
1141  {
1142  // rg fields were properly set, so add it to the list of
1143  // RG lines.
1144  // myStatus set in the method.
1145  status &= addRG(rg);
1146  }
1147  else
1148  {
1149  myErrorMessage = "SAM/BAM Header line failed to store RG record.";
1150  status = false;
1151  }
1152  }
1153  else if(tokens[0] == "@PG")
1154  {
1155  // Create a new PG record.
1156  SamHeaderPG* pg = new SamHeaderPG();
1157 
1158  if(pg->setFields(tokens))
1159  {
1160  // pg fields were properly set, so add it to the list of
1161  // PG lines.
1162  // myStatus set in the method.
1163  status &= addPG(pg);
1164  }
1165  else
1166  {
1167  myErrorMessage = "SAM/BAM Header line failed to store PG record.";
1168  status = false;
1169  }
1170  }
1171  else
1172  {
1173  // Unknown header type.
1174  myErrorMessage =
1175  "SAM/BAM Header line failure: Skipping unknown header type, ";
1176  myErrorMessage += (const char*)(tokens[0]);
1177  status = false;
1178  }
1179  return(status);
1180 }
1181 
1182 
1183 
1184 // Set the passed in string to the header line at the specified index.
1185 // It does NOT clear the current contents of header.
1186 // NOTE: some indexes will return blank if the entry was deleted.
1187 bool SamFileHeader::getHeaderLine(unsigned int index, std::string& header) const
1188 {
1189  // Check to see if the index is in range of the header records vector.
1190  if(index < myHeaderRecords.size())
1191  {
1192  // In range of the header records vector, so get the string for
1193  // that record.
1194  SamHeaderRecord* hdrRec = myHeaderRecords[index];
1195  hdrRec->appendString(header);
1196  return(true);
1197  }
1198  else
1199  {
1200  unsigned int commentIndex = index - myHeaderRecords.size();
1201  // Check to see if it is in range of the comments.
1202  if(commentIndex < myComments.size())
1203  {
1204  // It is in range of the comments, so add the type.
1205  header += "@CO\t";
1206  // Add the comment.
1207  header += myComments[commentIndex];
1208  // Add the new line.
1209  header += "\n";
1210  return(true);
1211  }
1212  }
1213  // Invalid index.
1214  return(false);
1215 }
SamFileHeader::getRGTagValue
const char * getRGTagValue(const char *tag, const char *id)
Get the value associated with the specified tag on the RG line with the specified read group identifi...
Definition: SamFileHeader.cpp:714
SamFileHeader::getHD
SamHeaderHD * getHD()
Get the HD object, returning NULL if there is no HD record.
Definition: SamFileHeader.cpp:768
SamFileHeader::getTagSO
const char * getTagSO()
DEPRECATED.
Definition: SamFileHeader.cpp:809
SamReferenceInfo::clear
void clear()
Reset this reference info.
Definition: SamReferenceInfo.cpp:123
SamFileHeader::resetCommentIter
void resetCommentIter()
Resets to the beginning of the comments so getNextComment returns the first comment.
Definition: SamFileHeader.cpp:998
SamHeaderRecord::isActiveHeaderRecord
bool isActiveHeaderRecord()
This record is active (true) if there is at least one tag set.
Definition: SamHeaderRecord.cpp:301
SamFileHeader::removePG
bool removePG(const char *id)
Remove PG record with the specified key.
Definition: SamFileHeader.cpp:647
SamHeaderRecord::getTagValue
const char * getTagValue(const char *tag) const
Return the value associated with the specified tag.
Definition: SamHeaderRecord.cpp:100
SamFileHeader::resetPGRecordIter
void resetPGRecordIter()
Reset to the beginning of the header records so the next call to getNextPGRecord returns the first PG...
Definition: SamFileHeader.cpp:860
SamFileHeader::getPG
SamHeaderPG * getPG(const char *id)
Get the PG object with the specified id, returning NULL if there is no PG object with that key.
Definition: SamFileHeader.cpp:789
String
Definition: StringBasics.h:39
SamHeaderRecord::PG
@ PG
Program.
Definition: SamHeaderRecord.h:35
SamFileHeader::setHDTag
bool setHDTag(const char *tag, const char *value)
Set the specified tag to the specified value in the HD header, remove the tag by specifying value="".
Definition: SamFileHeader.cpp:225
SamFileHeader::resetRGRecordIter
void resetRGRecordIter()
Reset to the beginning of the header records so the next call to getNextRGRecord returns the first RG...
Definition: SamFileHeader.cpp:852
SamReferenceInfo
Class for tracking the reference information mapping between the reference ids and the reference name...
Definition: SamReferenceInfo.h:28
SamFileHeader::getSQTagValue
const char * getSQTagValue(const char *tag, const char *name)
Get the value associated with the specified tag on the SQ line with the specified sequence name,...
Definition: SamFileHeader.cpp:696
SamHeaderSQ
Definition: SamHeaderSQ.h:24
SamFileHeader::addSQ
bool addSQ(SamHeaderSQ *sq)
Add the SQ record to the header.
Definition: SamFileHeader.cpp:415
SamFileHeader::resetHeader
void resetHeader()
Initialize the header.
Definition: SamFileHeader.cpp:90
SamHeaderRG
Definition: SamHeaderRG.h:24
SamFileHeader::getNextComment
const char * getNextComment()
Returns the comment on the next comment line.
Definition: SamFileHeader.cpp:985
SamHeaderRecord::SQ
@ SQ
Sequence Dictionary.
Definition: SamHeaderRecord.h:33
SamFileHeader::getSQ
SamHeaderSQ * getSQ(const char *name)
Get the SQ object with the specified sequence name, returning NULL if there is no SQ object with that...
Definition: SamFileHeader.cpp:775
SamFileHeader::resetHeaderRecordIter
void resetHeaderRecordIter()
Reset to the beginning of the header records so the next call to getNextHeaderRecord returns the firs...
Definition: SamFileHeader.cpp:966
SamFileHeader::getHDTagValue
const char * getHDTagValue(const char *tag)
Returns the value associated with the specified HD tag, returning "" if the tag does not exist in the...
Definition: SamFileHeader.cpp:683
SamHeaderRecord::HD
@ HD
Header.
Definition: SamHeaderRecord.h:32
SamFileHeader::copy
bool copy(const SamFileHeader &header)
Copy method copies the passed in header into this header.
Definition: SamFileHeader.cpp:59
SamHeaderPG
Definition: SamHeaderPG.h:25
SamFileHeader::getNumPGs
int getNumPGs()
Get the number of PG objects.
Definition: SamFileHeader.cpp:761
SamFileHeader::getHeaderString
bool getHeaderString(std::string &header) const
Set the passed in string to the entire header string, clearing its current contents.
Definition: SamFileHeader.cpp:131
SamHeaderRecord::setFields
bool setFields(const StringArray &tokens)
Set the fields from the passed in line.
Definition: SamHeaderRecord.cpp:38
SamHeaderRecord::RG
@ RG
Read Group.
Definition: SamHeaderRecord.h:34
SamFileHeader::operator=
SamFileHeader & operator=(const SamFileHeader &header)
Overload operator = to copy the passed in header into this header.
Definition: SamFileHeader.cpp:52
SamFileHeader::addHeader
bool addHeader(const char *header)
Add a header that is already preformatted in a const char*.
Definition: SamFileHeader.cpp:204
SamFileHeader::addHD
bool addHD(SamHeaderHD *hd)
Add the HD record to the header.
Definition: SamFileHeader.cpp:393
SamFileHeader::addRecordCopy
bool addRecordCopy(const SamHeaderRecord &hdrRec)
Add a copy of the specified header record to the header.
Definition: SamFileHeader.cpp:526
SamFileHeader::removeSQ
bool removeSQ(const char *name)
Remove SQ record with the specified key.
Definition: SamFileHeader.cpp:573
SamFileHeader::getPGTagValue
const char * getPGTagValue(const char *tag, const char *id)
Get the value associated with the specified tag on the PG line with the specified id,...
Definition: SamFileHeader.cpp:730
SamReferenceInfo::getReferenceLabel
const String & getReferenceLabel(int id) const
Get the reference name for the specified id, if the id is not found, return "*".
Definition: SamReferenceInfo.cpp:80
SamReferenceInfo::getReferenceID
int getReferenceID(const String &referenceName, bool addID=false)
Get the reference ID for the specified name, if addID is set to true, a reference id will be created ...
Definition: SamReferenceInfo.cpp:45
SamFileHeader::addRG
bool addRG(SamHeaderRG *rg)
Add the RG record to the header.
Definition: SamFileHeader.cpp:458
SamFileHeader::setPGTag
bool setPGTag(const char *tag, const char *value, const char *id)
Set the specified tag to the specified value in the PG header with the specified id,...
Definition: SamFileHeader.cpp:353
SamFileHeader::resetSQRecordIter
void resetSQRecordIter()
Reset to the beginning of the header records so the next call to getNextSQRecord returns the first SQ...
Definition: SamFileHeader.cpp:844
SamHeaderRecord::addKey
bool addKey(const char *value)
Add the key tag with the specified value (not for HD headers).
Definition: SamHeaderRecord.cpp:273
SamFileHeader::appendCommentLines
void appendCommentLines(std::string &commentLines)
Append all of the comment lines to the specified string.
Definition: SamFileHeader.cpp:972
SamFileHeader::addPG
bool addPG(SamHeaderPG *pg)
Add the PG record to the header.
Definition: SamFileHeader.cpp:492
SamFileHeader::getReferenceLabel
const String & getReferenceLabel(int id) const
Return the reference name (chromosome) for the specified reference id.
Definition: SamFileHeader.cpp:158
SamFileHeader::setSQTag
bool setSQTag(const char *tag, const char *value, const char *name)
Set the specified tag to the specified value in the SQ header with the specified name,...
Definition: SamFileHeader.cpp:252
SamFileHeader::getNextPGRecord
SamHeaderRecord * getNextPGRecord()
Get the next PG header record.
Definition: SamFileHeader.cpp:835
SamFileHeader::getReferenceInfo
const SamReferenceInfo & getReferenceInfo() const
Get the Reference Information.
Definition: SamFileHeader.cpp:165
SamHeaderRecord::createCopy
virtual SamHeaderRecord * createCopy() const =0
Return a pointer to a newly created header record of the appropriate type that is a copy of this reco...
SamFileHeader
This class allows a user to get/set the fields in a SAM/BAM Header.
Definition: SamFileHeader.h:35
SamFileHeader::getSortOrder
const char * getSortOrder()
Return the Sort Order value that is set in the Header, returning "" if this field does not exist.
Definition: SamFileHeader.cpp:797
SamFileHeader::getNextRGRecord
SamHeaderRecord * getNextRGRecord()
Get the next RG header record.
Definition: SamFileHeader.cpp:826
SamFileHeader::getNextHeaderRecord
SamHeaderRecord * getNextHeaderRecord()
Get the next header record, but not comment line.
Definition: SamFileHeader.cpp:908
SamFileHeader::getRG
SamHeaderRG * getRG(const char *id)
Get the RG object with the specified read group identifier, returning NULL if there is no RG object w...
Definition: SamFileHeader.cpp:782
SamHeaderRecord::appendString
bool appendString(std::string &header)
Appends the string representation of this header record to the passed in string.
Definition: SamHeaderRecord.cpp:234
SamFileHeader::removeRG
bool removeRG(const char *id)
Remove RG record with the specified key.
Definition: SamFileHeader.cpp:610
SamFileHeader::getReferenceID
int getReferenceID(const String &referenceName, bool addID=false)
Get the reference ID for the specified reference name (chromosome).
Definition: SamFileHeader.cpp:146
SamHeaderRecord::SamHeaderRecordType
SamHeaderRecordType
Specifies the Type for the sam header record (line).
Definition: SamHeaderRecord.h:31
SamFileHeader::getNumSQs
int getNumSQs()
Get the number of SQ objects.
Definition: SamFileHeader.cpp:747
SamFileHeader::getNextHeaderLine
bool getNextHeaderLine(std::string &headerLine)
Set the passed in string to the next header line, overwriting the passed in string.
Definition: SamFileHeader.cpp:941
SamHeaderRecord::reset
void reset()
Reset this header record to an empty state with no tags.
Definition: SamHeaderRecord.cpp:212
SamHeaderRecord::getType
SamHeaderRecordType getType()
Return the type of this header record (HD, SQ, RG, or PG) as an enum.
Definition: SamHeaderRecord.cpp:315
SamHeaderRecord
This class encapsulates the tag value pairs contained within a SAM Header line with accessors for getti...
Definition: SamHeaderRecord.h:28
SamFileHeader::addHeaderLine
bool addHeaderLine(const char *type, const char *tag, const char *value)
Add a header line that is just one tag with a const char* value.
Definition: SamFileHeader.cpp:180
SamHeaderRecord::setTag
bool setTag(const char *tag, const char *value)
Set the value of the specified tag to the specified value, deletes the tag when value is NULL.
Definition: SamHeaderRecord.cpp:119
SamHeaderHD
Definition: SamHeaderHD.h:24
SamFileHeader::getNextSQRecord
SamHeaderRecord * getNextSQRecord()
Get the next SQ header record.
Definition: SamFileHeader.cpp:817
StringArray
Definition: StringArray.h:24
SamFileHeader::setRGTag
bool setRGTag(const char *tag, const char *value, const char *id)
Set the specified tag to the specified value in the RG header with the specified id,...
Definition: SamFileHeader.cpp:311
SamFileHeader::removeHD
bool removeHD()
Remove the HD record.
Definition: SamFileHeader.cpp:554
SamFileHeader::addComment
bool addComment(const char *comment)
Add the specified comment to the header (do not include "@CO" or "\n").
Definition: SamFileHeader.cpp:213
SamReferenceInfo::add
void add(const char *referenceSequenceName, int32_t referenceSequenceLength)
Add reference sequence name and reference sequence length.
Definition: SamReferenceInfo.cpp:35
SamFileHeader::getNumRGs
int getNumRGs()
Get the number of RG objects.
Definition: SamFileHeader.cpp:754