libStatGen Software  1
SamInterface.cpp
1 /*
2  * Copyright (C) 2010 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include "SamInterface.h"
19 #include "SamRecordHelper.h"
20 
21 #include <limits>
22 #include <stdint.h>
23 
24 SamInterface::SamInterface()
25 {
26 }
27 
28 
29 SamInterface::~SamInterface()
30 {
31 }
32 
33 
34 // Read a SAM file's header.
35 bool SamInterface::readHeader(IFILE filePtr, SamFileHeader& header,
36  SamStatus& status)
37 {
38  if(filePtr == NULL)
39  {
40  // File is not open.
42  "Cannot read header since the file pointer is null");
43  return(false);
44  }
45 
46  // Clear the passed in header.
47  header.resetHeader();
48 
49  int numValid = 0;
50  int numInvalid = 0;
51  std::string errorMessages = "";
52 
53  do {
54  StringIntHash tags;
55  StringArray values;
56  buffer.ReadLine(filePtr);
57 
58  // Stop reading header lines if at the end of the file or
59  // if the line is not blank and does not start with an @.
60  if ( ifeof(filePtr) ||
61  ((buffer.Length() != 0) && (buffer[0] != '@')) )
62  {
63  break;
64  }
65 
66  // This is a header line, so add it to header.
67  if(header.addHeaderLine(buffer.c_str()))
68  {
69  if(buffer.Length() != 0)
70  {
71  ++numValid;
72  }
73  }
74  else
75  {
76  ++numInvalid;
77  // Failed reading the header.
78  errorMessages += header.getErrorMessage();
79  // Skip further processing on this line since it was an error.
80  continue;
81  }
82  } while (1);
83 
84  // Store the first record since it was read.
85  myFirstRecord = buffer;
86 
87  if(numInvalid > 0)
88  {
89  if(numValid == 0)
90  {
91  std::cerr << "Failed to parse " << numInvalid << " header lines";
92  std::cerr << ". No valid header lines.\n";
93  status.setStatus(SamStatus::FAIL_PARSE, errorMessages.c_str());
94  return(false);
95  }
96  }
97 
98  // Successfully read.
99  return(true);
100 }
101 
102 bool SamInterface::writeHeader(IFILE filePtr, SamFileHeader& header,
103  SamStatus& status)
104 {
105  if((filePtr == NULL) || (filePtr->isOpen() == false))
106  {
107  // File is not open, return failure.
109  "Cannot write header since the file pointer is null");
110  return(false);
111  }
112 
113  ////////////////////////////////
114  // Write the header to the file.
115  ////////////////////////////////
116  // Construct a string containing the entire header.
117  std::string headerString = "";
118  header.getHeaderString(headerString);
119 
120  int32_t headerLen = headerString.length();
121  int numWrite = 0;
122 
123  // Write the header to the file.
124  numWrite = ifwrite(filePtr, headerString.c_str(), headerLen);
125  if(numWrite != headerLen)
126  {
128  "Failed to write the SAM header.");
129  return(false);
130  }
131  return(true);
132 }
133 
134 
135 void SamInterface::readRecord(IFILE filePtr, SamFileHeader& header,
136  SamRecord& record,
137  SamStatus& samStatus)
138 {
139  // Initialize the status to success - will be set to false on failure.
140  samStatus = SamStatus::SUCCESS;
141 
142  if((filePtr == NULL) || (filePtr->isOpen() == false))
143  {
144  // File is not open.
145  samStatus.addError(SamStatus::FAIL_ORDER,
146  "filePtr does not point to an open file.");
147  return;
148  }
149 
150  // If the first record has been set, use that and clear it,
151  // otherwise read the record from the file.
152  if(myFirstRecord.Length() != 0)
153  {
154  buffer = myFirstRecord;
155  myFirstRecord.Clear();
156  }
157  else
158  {
159  // Read the next record.
160  buffer.Clear();
161  buffer.ReadLine(filePtr);
162  // If the end of the file and nothing was read, return false.
163  if ((ifeof(filePtr)) && (buffer.Length() == 0))
164  {
165  // end of the file and nothing to process.
167  "No more records in the file.");
168  return;
169  }
170  }
171 
172  tokens.ReplaceColumns(buffer, '\t');
173 
174 
175  // Error string for reporting a parsing failure.
176  String errorString = "";
177 
178  if (tokens.Length() < 11)
179  {
180  errorString = "Too few columns (";
181  errorString += tokens.Length();
182  errorString += ") in the Record, expected at least 11.";
184  errorString.c_str());
185  return;
186  }
187 
188  // Reset the record before setting any fields.
189  record.resetRecord();
190 
191  if(!record.setReadName(tokens[0]))
192  {
193  samStatus.addError(record.getStatus());
194  }
195 
196  long flagInt = 0;
197  if(!tokens[1].AsInteger(flagInt))
198  {
199  errorString = "flag, ";
200  errorString += tokens[1].c_str();
201  errorString += ", is not an integer.";
203  errorString.c_str());
204  }
205  else if((flagInt < 0) || (flagInt > UINT16_MAX))
206  {
207  errorString = "flag, ";
208  errorString += tokens[1].c_str();
209  errorString += ", is not between 0 and (2^16)-1 = 65535.";
211  errorString.c_str());
212  }
213  else if(!record.setFlag(flagInt))
214  {
215  samStatus.addError(record.getStatus().getStatus(),
216  record.getStatus().getStatusMessage());
217  }
218 
219  if(!record.setReferenceName(header, tokens[2]))
220  {
221  samStatus.addError(record.getStatus().getStatus(),
222  record.getStatus().getStatusMessage());
223  }
224 
225  long posInt = 0;
226  if(!tokens[3].AsInteger(posInt))
227  {
228  errorString = "position, ";
229  errorString += tokens[3].c_str();
230  errorString += ", is not an integer.";
232  errorString.c_str());
233  }
234  else if((posInt < INT32_MIN) || (posInt > INT32_MAX))
235  {
236  // If it is not in this range, it cannot fit into a 32 bit int.
237  errorString = "position, ";
238  errorString += tokens[3].c_str();
239  errorString += ", does not fit in a 32 bit signed int.";
241  errorString.c_str());
242  }
243  else if(!record.set1BasedPosition(posInt))
244  {
245  samStatus.addError(record.getStatus().getStatus(),
246  record.getStatus().getStatusMessage());
247  }
248 
249  long mapInt = 0;
250  if(!tokens[4].AsInteger(mapInt))
251  {
252  errorString = "map quality, ";
253  errorString += tokens[4].c_str();
254  errorString += ", is not an integer.";
256  errorString.c_str());
257  }
258  else if((mapInt < 0) || (mapInt > UINT8_MAX))
259  {
260  errorString = "map quality, ";
261  errorString += tokens[4].c_str();
262  errorString += ", is not between 0 and (2^8)-1 = 255.";
264  errorString.c_str());
265  }
266  else if(!record.setMapQuality(mapInt))
267  {
268  samStatus.addError(record.getStatus().getStatus(),
269  record.getStatus().getStatusMessage());
270  }
271 
272  if(!record.setCigar(tokens[5]))
273  {
274  samStatus.addError(record.getStatus().getStatus(),
275  record.getStatus().getStatusMessage());
276  }
277 
278  if(!record.setMateReferenceName(header, tokens[6]))
279  {
280  samStatus.addError(record.getStatus().getStatus(),
281  record.getStatus().getStatusMessage());
282  }
283 
284  long matePosInt = 0;
285  if(!tokens[7].AsInteger(matePosInt))
286  {
287  errorString = "mate position, ";
288  errorString += tokens[7].c_str();
289  errorString += ", is not an integer.";
291  errorString.c_str());
292  }
293  else if(!record.set1BasedMatePosition(matePosInt))
294  {
295  samStatus.addError(record.getStatus().getStatus(),
296  record.getStatus().getStatusMessage());
297  }
298 
299  long insertInt = 0;
300  if(!tokens[8].AsInteger(insertInt))
301  {
302  errorString = "insert size, ";
303  errorString += tokens[8].c_str();
304  errorString += ", is not an integer.";
306  errorString.c_str());
307  }
308  else if(!record.setInsertSize(insertInt))
309  {
310  samStatus.addError(record.getStatus().getStatus(),
311  record.getStatus().getStatusMessage());
312  }
313 
314  if(!record.setSequence(tokens[9]))
315  {
316  samStatus.addError(record.getStatus().getStatus(),
317  record.getStatus().getStatusMessage());
318  }
319 
320  if(!record.setQuality(tokens[10]))
321  {
322  samStatus.addError(record.getStatus().getStatus(),
323  record.getStatus().getStatusMessage());
324  }
325 
326  // Clear the tag fields.
327  record.clearTags();
328 
329  // Add the tags to the record.
330  for (int i = 11; i < tokens.Length(); i++)
331  {
332  String & nugget = tokens[i];
333 
334  if (nugget.Length() < 6 || nugget[2] != ':' || nugget[4] != ':')
335  {
336  // invalid tag format.
337  errorString = "Invalid Tag Format: ";
338  errorString += nugget.c_str();
339  errorString += ", should be cc:c:x*.";
341  errorString.c_str());
342  continue;
343  }
344 
345  // Valid tag format.
346  // Add the tag.
347  if(!record.addTag((const char *)nugget, nugget[3],
348  (const char *)nugget + 5))
349  {
350  samStatus.addError(record.getStatus().getStatus(),
351  record.getStatus().getStatusMessage());
352  }
353  }
354 
355  return;
356 }
357 
358 
359 SamStatus::Status SamInterface::writeRecord(IFILE filePtr,
360  SamFileHeader& header,
361  SamRecord& record,
362  SamRecord::SequenceTranslation translation)
363 {
364  // Store all the fields into a string, then write the string.
365  String recordString = record.getReadName();
366  recordString += "\t";
367  recordString += record.getFlag();
368  recordString += "\t";
369  recordString += record.getReferenceName();
370  recordString += "\t";
371  recordString += record.get1BasedPosition();
372  recordString += "\t";
373  recordString += record.getMapQuality();
374  recordString += "\t";
375  recordString += record.getCigar();
376  recordString += "\t";
377  recordString += record.getMateReferenceNameOrEqual();
378  recordString += "\t";
379  recordString += record.get1BasedMatePosition();
380  recordString += "\t";
381  recordString += record.getInsertSize();
382  recordString += "\t";
383  recordString += record.getSequence(translation);
384  recordString += "\t";
385  recordString += record.getQuality();
386 
387  // If there are any tags, add a preceding tab.
388  if(record.getTagLength() != 0)
389  {
390  recordString += "\t";
391  SamRecordHelper::genSamTagsString(record, recordString);
392  }
393 
394  recordString += "\n";
395 
396 
397  // Write the record.
398  ifwrite(filePtr, recordString.c_str(), recordString.Length());
399  return(SamStatus::SUCCESS);
400 }
401 
402 
403 void SamInterface::ParseHeaderLine(StringIntHash & tags, StringArray & values)
404 {
405  tags.Clear();
406  values.Clear();
407 
408  tokens.AddColumns(buffer, '\t');
409 
410  for (int i = 1; i < tokens.Length(); i++)
411  {
412  tags.Add(tokens[i].Left(2), i - 1);
413  values.Push(tokens[i].SubStr(3));
414  }
415 }
416 
SamRecord::setMapQuality
bool setMapQuality(uint8_t mapQuality)
Set the mapping quality (MAPQ).
Definition: SamRecord.cpp:251
SamRecord::getReferenceName
const char * getReferenceName()
Get the reference sequence name (RNAME) of the record.
Definition: SamRecord.cpp:1286
SamRecord::SequenceTranslation
SequenceTranslation
Enum containing the settings on how to translate the sequence if a reference is available.
Definition: SamRecord.h:57
SamRecord::setSequence
bool setSequence(const char *seq)
Sets the sequence (SEQ) to the specified SAM formatted sequence string.
Definition: SamRecord.cpp:344
String
Definition: StringBasics.h:39
SamRecord::setMateReferenceName
bool setMateReferenceName(SamFileHeader &header, const char *mateReferenceName)
Set the mate/next fragment's reference sequence name (RNEXT) to the specified name,...
Definition: SamRecord.cpp:297
StatGenStatus::NO_MORE_RECS
@ NO_MORE_RECS
NO_MORE_RECS: failed to read a record since there are no more to read either in the file or section i...
Definition: StatGenStatus.h:36
SamRecord::getMapQuality
uint8_t getMapQuality()
Get the mapping quality (MAPQ) of the record.
Definition: SamRecord.cpp:1328
SamFileHeader::resetHeader
void resetHeader()
Initialize the header.
Definition: SamFileHeader.cpp:90
SamRecord::clearTags
void clearTags()
Clear the tags in this record.
Definition: SamRecord.cpp:965
StatGenStatus::SUCCESS
@ SUCCESS
method completed successfully.
Definition: StatGenStatus.h:32
StatGenStatus::getStatus
Status getStatus() const
Return the enum for this status object.
Definition: StatGenStatus.cpp:142
SamRecord::getFlag
uint16_t getFlag()
Get the flag (FLAG).
Definition: SamRecord.cpp:1372
InputFile::isOpen
bool isOpen() const
Returns whether or not the file was successfully opened.
Definition: InputFile.h:423
SamRecord::setReadName
bool setReadName(const char *readName)
Set QNAME to the passed in name.
Definition: SamRecord.cpp:193
SamFileHeader::getHeaderString
bool getHeaderString(std::string &header) const
Set the passed in string to the entire header string, clearing its current contents.
Definition: SamFileHeader.cpp:131
StatGenStatus
This class is used to track the status results of some methods in the BAM classes.
Definition: StatGenStatus.h:27
StatGenStatus::FAIL_PARSE
@ FAIL_PARSE
failed to parse a record/header - invalid format.
Definition: StatGenStatus.h:42
SamRecord::get1BasedPosition
int32_t get1BasedPosition()
Get the 1-based(SAM) leftmost position (POS) of the record.
Definition: SamRecord.cpp:1300
SamRecord::set1BasedPosition
bool set1BasedPosition(int32_t position)
Set the leftmost position (POS) using the specified 1-based (SAM format) value.
Definition: SamRecord.cpp:236
ifeof
int ifeof(IFILE file)
Check to see if we have reached the EOF (returns 0 if not EOF).
Definition: InputFile.h:654
SamRecord::getQuality
const char * getQuality()
Returns the SAM formatted quality string (QUAL).
Definition: SamRecord.cpp:1626
StringIntHash
Definition: StringHash.h:194
StatGenStatus::setStatus
void setStatus(Status newStatus, const char *newMessage)
Set the status with the specified status enum and message.
Definition: StatGenStatus.cpp:83
SamRecord::getReadName
const char * getReadName()
Returns the SAM formatted Read Name (QNAME).
Definition: SamRecord.cpp:1530
StatGenStatus::Status
Status
Return value enum for StatGenFile methods.
Definition: StatGenStatus.h:32
SamRecord::getStatus
const SamStatus & getStatus()
Returns the status associated with the last method that sets the status.
Definition: SamRecord.cpp:2391
SamRecord::getTagLength
uint32_t getTagLength()
Returns the length of the BAM formatted tags.
Definition: SamRecord.cpp:1917
SamRecord::setInsertSize
bool setInsertSize(int32_t insertSize)
Sets the inferred insert size (ISIZE)/observed template length (TLEN).
Definition: SamRecord.cpp:336
SamRecord::setFlag
bool setFlag(uint16_t flag)
Set the bitwise FLAG to the specified value.
Definition: SamRecord.cpp:215
SamRecord::getSequence
const char * getSequence()
Returns the SAM formatted sequence string (SEQ), translating the base as specified by setSequenceTran...
Definition: SamRecord.cpp:1556
StatGenStatus::addError
void addError(Status newStatus, const char *newMessage)
Add the specified error message to the status message, setting the status to newStatus if the current...
Definition: StatGenStatus.cpp:99
StatGenStatus::getStatusMessage
const char * getStatusMessage() const
Return the status message for this object.
Definition: StatGenStatus.cpp:149
SamFileHeader
This class allows a user to get/set the fields in a SAM/BAM Header.
Definition: SamFileHeader.h:35
SamRecord::addTag
bool addTag(const char *tag, char vtype, const char *value)
Add the specified tag,vtype,value to the record.
Definition: SamRecord.cpp:779
SamRecord::getInsertSize
int32_t getInsertSize()
Get the inferred insert size of the read pair (ISIZE) or observed template length (TLEN).
Definition: SamRecord.cpp:1447
SamRecord::getCigar
const char * getCigar()
Returns the SAM formatted CIGAR string.
Definition: SamRecord.cpp:1543
SamRecord::setCigar
bool setCigar(const char *cigar)
Set the CIGAR to the specified SAM formatted cigar string.
Definition: SamRecord.cpp:259
SamRecord
Class providing an easy to use interface to get/set/operate on the fields in a SAM/BAM record.
Definition: SamRecord.h:52
SamRecord::get1BasedMatePosition
int32_t get1BasedMatePosition()
Get the 1-based(SAM) leftmost mate/next fragment's position (PNEXT).
Definition: SamRecord.cpp:1433
SamRecord::setReferenceName
bool setReferenceName(SamFileHeader &header, const char *referenceName)
Set the reference sequence name (RNAME) to the specified name, using the header to determine the refe...
Definition: SamRecord.cpp:223
SamRecord::resetRecord
void resetRecord()
Reset the fields of the record to a default value.
Definition: SamRecord.cpp:91
SamRecordHelper::genSamTagsString
static bool genSamTagsString(SamRecord &record, String &returnString, char delim='\t')
Helper to append the SAM string representation of all the tags to the specified string.
Definition: SamRecordHelper.cpp:56
SamRecord::getMateReferenceNameOrEqual
const char * getMateReferenceNameOrEqual()
Get the mate/next fragment's reference sequence name (RNEXT), returning "=" if it is the same as the ...
Definition: SamRecord.cpp:1408
InputFile
Class for easily reading/writing files without having to worry about file type (uncompressed,...
Definition: InputFile.h:37
StatGenStatus::FAIL_IO
@ FAIL_IO
method failed due to an I/O issue.
Definition: StatGenStatus.h:37
StatGenStatus::FAIL_ORDER
@ FAIL_ORDER
FAIL_ORDER: method failed because it was called out of order, like trying to read a file without open...
Definition: StatGenStatus.h:41
SamFileHeader::addHeaderLine
bool addHeaderLine(const char *type, const char *tag, const char *value)
Add a header line that is just one tag with a const char* value.
Definition: SamFileHeader.cpp:180
ifwrite
unsigned int ifwrite(IFILE file, const void *buffer, unsigned int size)
Write the specified number of bytes from the specified buffer into the file.
Definition: InputFile.h:669
SamRecord::setQuality
bool setQuality(const char *quality)
Sets the quality (QUAL) to the specified SAM formatted quality string.
Definition: SamRecord.cpp:357
SamRecord::set1BasedMatePosition
bool set1BasedMatePosition(int32_t matePosition)
Set the mate/next fragment's leftmost position (PNEXT) using the specified 1-based (SAM format) value...
Definition: SamRecord.cpp:322
StringArray
Definition: StringArray.h:24
SamFileHeader::getErrorMessage
const char * getErrorMessage()
Get the failure message if a method returned failure.
Definition: SamFileHeader.h:423