19 #include "SamFileHeader.h"
20 #include "SamRecord.h"
21 #include "BamInterface.h"
22 #include "SamInterface.h"
23 #include "BgzfFileType.h"
36 : myStatus(errorHandlingType)
48 init(filename, mode, NULL);
56 : myStatus(errorHandlingType)
58 init(filename, mode, NULL);
67 init(filename, mode, header);
76 : myStatus(errorHandlingType)
78 init(filename, mode, header);
100 while (filename[lastchar] != 0) lastchar++;
103 if((lastchar >= 1) && (filename[0] ==
'-'))
107 if(strcmp(filename,
"-.bam") == 0)
123 ifread(myFilePtr, magic, 4);
125 else if(strcmp(filename,
"-.ubam") == 0)
135 #ifdef __ZLIB_AVAILABLE__
136 BgzfFileType::setRequireEofBlock(
false);
144 ifread(myFilePtr, magic, 4);
146 else if((strcmp(filename,
"-") == 0) || (strcmp(filename,
"-.sam") == 0))
156 std::string errorMessage =
"Invalid SAM/BAM filename, ";
157 errorMessage += filename;
158 errorMessage +=
". From stdin, can only be '-', '-.sam', '-.bam', or '-.ubam'";
178 std::string errorMessage =
"Failed to Open ";
179 errorMessage += filename;
180 errorMessage +=
" for reading";
188 ifread(myFilePtr, magic, 4);
190 if (magic[0] ==
'B' && magic[1] ==
'A' && magic[2] ==
'M' &&
229 while (filename[lastchar] != 0) lastchar++;
231 filename[lastchar - 4] ==
'u' &&
232 filename[lastchar - 3] ==
'b' &&
233 filename[lastchar - 2] ==
'a' &&
234 filename[lastchar - 1] ==
'm')
238 if((lastchar == 6) && (filename[0] ==
'-') && (filename[1] ==
'.'))
247 else if (lastchar >= 3 &&
248 filename[lastchar - 3] ==
'b' &&
249 filename[lastchar - 2] ==
'a' &&
250 filename[lastchar - 1] ==
'm')
254 if((lastchar == 5) && (filename[0] ==
'-') && (filename[1] ==
'.'))
267 if((lastchar >= 1) && (filename[0] ==
'-'))
276 if (myFilePtr == NULL)
278 std::string errorMessage =
"Failed to Open ";
279 errorMessage += filename;
280 errorMessage +=
" for writing";
303 if(myBamIndex != NULL)
315 std::string errorMessage =
"Failed to read the bam Index file: ";
316 errorMessage += bamIndexFilename;
330 if(myFilePtr == NULL)
334 std::string errorMessage =
"Failed to read the bam Index file -"
335 " the BAM file needs to be read first in order to determine"
336 " the index filename.";
341 const char* bamBaseName = myFilePtr->
getFileName();
343 std::string indexName = bamBaseName;
346 bool foundFile =
true;
354 catch (std::exception&)
364 size_t startExt = indexName.find(
".bam");
365 if(startExt == std::string::npos)
372 indexName.erase(startExt, 4);
382 myRefPtr = reference;
389 myReadTranslation = translation;
396 myWriteTranslation = translation;
412 if (myFilePtr != NULL)
415 return(myFilePtr->
isOpen());
426 if (myFilePtr != NULL)
429 return(
ifeof(myFilePtr));
444 "Cannot read header since the file is not open for reading");
452 "Cannot read header since it has already been read.");
456 if(myInterfacePtr->readHeader(myFilePtr, header,
myStatus))
476 "Cannot write header since the file is not open for writing");
484 "Cannot write header since it has already been written");
488 if(myInterfacePtr->writeHeader(myFilePtr, header,
myStatus))
510 "Cannot read record since the file is not open for reading");
511 throw(std::runtime_error(
"SOFTWARE BUG: trying to read a SAM/BAM record prior to opening the file."));
520 "Cannot read record since the header has not been read.");
521 throw(std::runtime_error(
"SOFTWARE BUG: trying to read a SAM/BAM record prior to reading the header."));
529 if(!processNewSection(header))
550 if(!ensureIndexedReadPosition())
560 myInterfacePtr->readRecord(myFilePtr, header, record,
myStatus);
572 if(!checkRecordInSection(record))
580 uint16_t flag = record.
getFlag();
581 if((flag & myRequiredFlags) != myRequiredFlags)
587 if((flag & myExcludedFlags) != 0)
626 "Cannot write record since the file is not open for writing");
634 "Cannot write record since the header has not been written");
643 "Cannot write the record since the file is not properly sorted.");
654 myStatus = myInterfacePtr->writeRecord(myFilePtr, header, record,
671 mySortedType = sortType;
710 "Cannot set section since there is no bam file open");
715 myOverlapSection = overlap;
720 myChunksToRead.clear();
723 myCurrentChunkEnd = 0;
729 myPrevReadName.Clear();
746 "Cannot set section since there is no bam file open");
751 myOverlapSection = overlap;
754 if((strcmp(refName,
"") == 0) || (strcmp(refName,
"*") == 0))
766 myChunksToRead.clear();
769 myCurrentChunkEnd = 0;
775 myPrevReadName.Clear();
783 myRequiredFlags = requiredFlags;
784 myExcludedFlags = excludedFlags;
793 if(myBamIndex == NULL)
796 "Cannot get num mapped reads from the index until it has been read.");
808 if(myBamIndex == NULL)
811 "Cannot get num unmapped reads from the index until it has been read.");
824 if(myBamIndex == NULL)
827 "Cannot get num mapped reads from the index until it has been read.");
831 if((strcmp(refName,
"") != 0) && (strcmp(refName,
"*") != 0))
846 if(myBamIndex == NULL)
849 "Cannot get num unmapped reads from the index until it has been read.");
853 if((strcmp(refName,
"") != 0) && (strcmp(refName,
"*") != 0))
913 myInterfacePtr = NULL;
919 myAttemptRecovery =
false;
925 void SamFile::init(
const char* filename, OpenType mode,
SamFileHeader* header)
931 bool openStatus =
true;
946 std::cerr <<
"FAILURE - EXITING!!!" << std::endl;
956 if (myFilePtr != NULL)
962 if(myInterfacePtr != NULL)
964 delete myInterfacePtr;
965 myInterfacePtr = NULL;
972 myPrevReadName.Clear();
983 myNewSection =
false;
984 myOverlapSection =
true;
985 myCurrentChunkEnd = 0;
986 myChunksToRead.clear();
987 if(myBamIndex != NULL)
1008 if(myRefPtr != NULL)
1014 bool status =
false;
1023 if(mySortedType ==
FLAG)
1026 mySortedType = getSortOrderFromHeader(header);
1036 if((myPrevReadName.Compare(readName) > 0) &&
1037 (strcmp(myPrevReadName.c_str(), readName) > 0))
1040 String errorMessage =
"ERROR: File is not sorted by read name at record ";
1042 errorMessage +=
"\n\tPrevious record was ";
1043 errorMessage += myPrevReadName;
1044 errorMessage +=
", but this record is ";
1045 errorMessage += readName;
1047 errorMessage.c_str());
1052 myPrevReadName = readName;
1068 myPrevRefID = refID;
1075 String errorMessage =
"ERROR: File is not coordinate sorted at record ";
1077 errorMessage +=
"\n\tPrevious record was unmapped, but this record is ";
1080 errorMessage.c_str());
1083 else if(refID < myPrevRefID)
1087 String errorMessage =
"ERROR: File is not coordinate sorted at record ";
1089 errorMessage +=
"\n\tPrevious record was ";
1091 errorMessage +=
", but this record is ";
1094 errorMessage.c_str());
1100 if(refID > myPrevRefID)
1110 String errorMessage =
"ERROR: File is not coordinate sorted at record ";
1112 errorMessage +=
"\n\tPreviousRecord was ";
1114 errorMessage +=
", but this record is ";
1117 errorMessage.c_str());
1122 myPrevRefID = refID;
1141 if(strcmp(tag,
"queryname") == 0)
1145 else if(strcmp(tag,
"coordinate") == 0)
1149 return(headerSortOrder);
1153 bool SamFile::ensureIndexedReadPosition()
1165 uint64_t currentPos =
iftell(myFilePtr);
1166 if(currentPos >= myCurrentChunkEnd)
1169 if(myChunksToRead.empty())
1176 Chunk newChunk = myChunksToRead.pop();
1180 if(newChunk.chunk_beg != currentPos)
1183 if(
ifseek(myFilePtr, newChunk.chunk_beg, SEEK_SET) !=
true)
1187 "Failed to seek to the next record");
1192 myCurrentChunkEnd = newChunk.chunk_end;
1198 bool SamFile::checkRecordInSection(
SamRecord& record)
1200 bool recordFound =
true;
1235 recordFound =
false;
1238 if(!myOverlapSection)
1247 ((myEndPos != -1) &&
1252 recordFound =
false;
1256 return(recordFound);
1262 myNewSection =
false;
1265 if(myBamIndex == NULL)
1269 "Cannot read section since there is no index file open");
1270 throw(std::runtime_error(
"SOFTWARE BUG: trying to read a BAM record by section prior to opening the BAM Index file."));
1279 "Cannot read section since there is no bam file open");
1280 throw(std::runtime_error(
"SOFTWARE BUG: trying to read a BAM record by section without opening a BAM file."));
1288 "Cannot read record since the header has not been read.");
1289 throw(std::runtime_error(
"SOFTWARE BUG: trying to read a BAM record by section prior to opening the header."));
1299 myChunksToRead.clear();
1302 myCurrentChunkEnd = 0;
1306 if(!myRefName.empty())
1322 myChunksToRead) ==
true)
1328 String errorMsg =
"Failed to get the specified region, refID = ";
1329 errorMsg += myRefID;
1330 errorMsg +=
"; startPos = ";
1331 errorMsg += myStartPos;
1332 errorMsg +=
"; endPos = ";
1333 errorMsg += myEndPos;
1352 bool SamFile::attemptRecoverySync(
bool (*checkSignature)(
void *data) ,
int length)
1354 if(myFilePtr==NULL)
return false;
1356 return myFilePtr->attemptRecoverySync(checkSignature, length);
1378 :
SamFile(filename, READ, errorHandlingType)
1386 :
SamFile(filename, READ, header)
1395 :
SamFile(filename, READ, errorHandlingType, header)
1400 SamFileReader::~SamFileReader()
1422 :
SamFile(filename, WRITE, errorHandlingType)
1430 :
SamFile(filename, WRITE, header)
1439 :
SamFile(filename, WRITE, errorHandlingType, header)
1444 SamFileWriter::~SamFileWriter()
bool myIsOpenForWrite
Flag to indicate if a file is open for writing.
void setSequenceTranslation(SequenceTranslation translation)
Set the type of sequence translation to use when getting the sequence.
@ COORDINATE
file is sorted by coordinate.
int32_t get0BasedPosition()
Get the 0-based(BAM) leftmost position of the record.
@ FLAG
SO flag from the header indicates the sort type.
bool IsEOF()
Returns whether or not the end of the file has been reached.
SequenceTranslation
Enum containing the settings on how to translate the sequence if a reference is available.
int32_t get0BasedAlignmentEnd()
Returns the 0-based inclusive rightmost position of the clipped sequence.
bool ReadBamIndex()
Read the bam index file using the BAM filename as a base.
int32_t myPrevCoord
Previous values used for checking if the file is sorted.
bool WriteRecord(SamFileHeader &header, SamRecord &record)
Writes the specified record into the file.
int32_t getReferenceID()
Get the reference sequence id of the record (BAM format rid).
bool getChunksForRegion(int32_t refID, int32_t start, int32_t end, SortedChunkList &chunkList)
Get the list of chunks associated with this region.
const char * GetStatusMessage()
Get the Status Message of the last call that sets status.
bool myIsOpenForRead
Flag to indicate if a file is open for reading.
@ NO_MORE_RECS
NO_MORE_RECS: failed to read a record since there are no more to read either in the file or section i...
void resetFile()
Resets the file prepping for a new file.
int32_t getNumUnMappedReadsFromIndex(int32_t refID)
Get the number of unmapped reads in the specified reference id.
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
bool IsOpen()
Returns whether or not the file has been opened successfully.
int32_t getNumMappedReads(int32_t refID)
Get the number of mapped reads for this reference id.
static const int NO_REF_ID
Constant for the value returned if a reference id does not exist for a queried reference name.
bool ReadRecord(SamFileHeader &header, SamRecord &record)
Reads the next record from the file & stores it in the passed in record.
@ QUERY_NAME
file is sorted by queryname.
@ SUCCESS
method completed successfully.
SamFileReader()
Default Constructor.
OpenType
Enum for indicating whether to open the file for read or write.
void GenerateStatistics(bool genStats)
Whether or not statistics should be generated for this file.
int32_t getNumMappedReadsFromIndex(int32_t refID)
Get the number of mapped reads in the specified reference id.
HandlingType
This specifies how this class should respond to errors.
SortedType
Enum for indicating the type of sort expected in the file.
uint16_t getFlag()
Get the flag (FLAG).
SamStatistics * myStatistics
Pointer to the statistics for this file.
bool myHasHeader
Flag to indicate if a header has been read/written - required before being able to read/write a recor...
void SetReadFlags(uint16_t requiredFlags, uint16_t excludedFlags)
Specify which reads should be returned by ReadRecord.
static const int32_t REF_ID_ALL
The number used to indicate that all reference ids should be used.
@ FAIL_PARSE
failed to parse a record/header - invalid format.
bool OpenForWrite(const char *filename, SamFileHeader *header=NULL)
Open a sam/bam file for writing with the specified filename, determining SAM/BAM from the extension (...
@ INVALID_SORT
record is invalid due to it not being sorted.
int32_t getNumUnMappedReads(int32_t refID)
Get the number of unmapped reads for this reference id.
SamFile()
Default Constructor, initializes the variables, but does not open any files.
void setStatus(Status newStatus, const char *newMessage)
Set the status with the specified status enum and message.
uint32_t myRecordCount
Keep a count of the number of records that have been read/written so far.
const char * getReadName()
Returns the SAM formatted Read Name (QNAME).
Status
Return value enum for StatGenFile methods.
bool myIsBamOpenForRead
Values for reading Sorted BAM files via the index.
bool SetReadSection(int32_t refID)
Sets which reference id (index into the BAM list of reference information) of the BAM file should be ...
@ UNSORTED
file is not sorted.
void setSortedValidation(SortedType sortType)
Set the flag to validate that the file is sorted as it is read/written.
Class providing an easy to use interface to get/set/operate on the fields in a SAM/BAM record.
virtual ~SamFile()
Destructor.
bool OpenForRead(const char *filename, SamFileHeader *header=NULL)
Open a sam/bam file for reading with the specified filename, determing the type of file and SAM/BAM b...
bool WriteHeader(SamFileHeader &header)
Writes the specified header into the file.
uint32_t GetNumOverlaps(SamRecord &samRecord)
Returns the number of bases in the passed in read that overlap the region that is currently set.
@ FAIL_IO
method failed due to an I/O issue.
uint32_t getNumOverlaps(int32_t start, int32_t end)
Return the number of bases in this read that overlap the passed in region.
uint32_t GetCurrentRecordCount()
Return the number of records that have been read/written so far.
@ FAIL_ORDER
FAIL_ORDER: method failed because it was called out of order, like trying to read a file without open...
SamFileWriter()
Default Constructor.
bool ReadHeader(SamFileHeader &header)
Reads the header section from the file and stores it in the passed in header.
@ NONE
Leave the sequence as is.
void SetReadSequenceTranslation(SamRecord::SequenceTranslation translation)
Set the type of sequence translation to use when reading the sequence.
const BamIndex * GetBamIndex()
Return the bam index if one has been opened.
void Close()
Close the file if there is one open.
bool validateSortOrder(SamRecord &record, SamFileHeader &header)
Validate that the record is sorted compared to the previously read record if there is one,...
void setReference(GenomeSequence *reference)
Set the reference to the specified genome sequence object.
Allows the user to easily read/write a SAM/BAM file.
void SetReference(GenomeSequence *reference)
Sets the reference to the specified genome sequence object.
SamStatus::Status readIndex(const char *filename)
void SetWriteSequenceTranslation(SamRecord::SequenceTranslation translation)
Set the type of sequence translation to use when writing the sequence.
SamStatus myStatus
The status of the last SamFile command.
static const int32_t REF_ID_UNMAPPED
The number used for the reference id of unmapped reads.