libStatGen Software  1
FastQFile.cpp
1 /*
2  * Copyright (C) 2010 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include <iostream>
19 
20 #include "InputFile.h"
21 #include "FastQFile.h"
22 #include "BaseUtilities.h"
23 
24 // Constructor.
25 // minReadLength - The minimum length that a base sequence must be for
26 // it to be valid.
27 // numPrintableErrors - The maximum number of errors that should be reported
28 // in detail before suppressing the errors.
29 //
30 FastQFile::FastQFile(int minReadLength, int numPrintableErrors)
31  : myFile(NULL),
32  myBaseComposition(),
33  myQualPerCycle(),
34  myCountPerCycle(),
35  myCheckSeqID(true),
36  myMinReadLength(minReadLength),
37  myNumPrintableErrors(numPrintableErrors),
38  myMaxErrors(-1),
39  myDisableMessages(false),
40  myFileProblem(false)
41 {
42  // Reset the member data.
43  reset();
44 }
45 
46 
48 {
49  myDisableMessages = true;
50 }
51 
52 
54 {
55  myDisableMessages = false;
56 }
57 
58 
59 // Disable Unique Sequence ID checking.
60 // Unique Sequence ID checking is enabled by default.
62 {
63  myCheckSeqID = false;
64 }
65 
66 
67 // Enable Unique Sequence ID checking.
68 // Unique Sequence ID checking is enabled by default.
70 {
71  myCheckSeqID = true;
72 }
73 
74 
75 // Set the number of errors after which to quit reading/validating a file.
76 void FastQFile::setMaxErrors(int maxErrors)
77 {
78  myMaxErrors = maxErrors;
79 }
80 
81 
82 // Open a FastQFile.
84  BaseAsciiMap::SPACE_TYPE spaceType)
85 {
86  // reset the member data.
87  reset();
88 
89  myBaseComposition.resetBaseMapType();
90  myBaseComposition.setBaseMapType(spaceType);
91  myQualPerCycle.clear();
92  myCountPerCycle.clear();
93 
95 
96  // Close the file if there is already one open - checked by close.
97  status = closeFile();
98  if(status == FastQStatus::FASTQ_SUCCESS)
99  {
100  // Successfully closed a previously opened file if there was one.
101 
102  // Open the file
103  myFile = ifopen(fileName, "rt");
104  myFileName = fileName;
105 
106  if(myFile == NULL)
107  {
108  // Failed to open the file.
110  }
111  }
112 
113  if(status != FastQStatus::FASTQ_SUCCESS)
114  {
115  // Failed to open the file.
116  std::string errorMessage = "ERROR: Failed to open file: ";
117  errorMessage += fileName;
118  logMessage(errorMessage.c_str());
119  }
120  return(status);
121 }
122 
123 
124 // Close a FastQFile.
126 {
127  int closeStatus = 0; // Success.
128 
129  // If a file has been opened, close it.
130  if(myFile != NULL)
131  {
132  // Close the file.
133  closeStatus = ifclose(myFile);
134  myFile = NULL;
135  }
136  if(closeStatus == 0)
137  {
138  // Success - either there wasn't a file to close or it was closed
139  // successfully.
141  }
142  else
143  {
144  std::string errorMessage = "Failed to close file: ";
145  errorMessage += myFileName.c_str();
146  logMessage(errorMessage.c_str());
148  }
149 }
150 
151 
152 // Check to see if the file is open.
154 {
155  // Check to see if the file is open.
156  if((myFile != NULL) && (myFile->isOpen()))
157  {
158  // File pointer exists and the file is open.
159  return true;
160  }
161 
162  // File is not open.
163  return false;
164 }
165 
166 
167 // Check to see if the file is at the end of the file.
169 {
170  // Check to see if the file is open.
171  if((myFile != NULL) && (ifeof(myFile)))
172  {
173  // At EOF.
174  return true;
175  }
176 
177  // Not at EOF.
178  return false;
179 }
180 
181 
182 // Returns whether or not to keep reading the file.
183 // Stop reading (false) if eof or there is a problem reading the file.
185 {
186  if(isEof() || myFileProblem)
187  {
188  return(false);
189  }
190  return(true);
191 }
192 
193 
194 // Validate the specified fastq file
196  bool printBaseComp,
197  BaseAsciiMap::SPACE_TYPE spaceType,
198  bool printQualAvg)
199 {
200  // Open the fastqfile.
201  if(openFile(filename, spaceType) != FastQStatus::FASTQ_SUCCESS)
202  {
203  // Failed to open the specified file.
205  }
206 
207  // Track the total number of sequences that were validated.
208  int numSequences = 0;
209 
210  // Keep reading the file until there are no more fastq sequences to process
211  // and not configured to quit after a certain number of errors or there
212  // has not yet been that many errors.
213  // Or exit if there is a problem reading the file.
215  while (keepReadingFile() &&
216  ((myMaxErrors == -1) || (myMaxErrors > myNumErrors)))
217  {
218  // Validate one sequence. This call will read all the lines for
219  // one sequence.
220  status = readFastQSequence();
221  if((status == FastQStatus::FASTQ_SUCCESS) || (status == FastQStatus::FASTQ_INVALID))
222  {
223  // Read a sequence and it is either valid or invalid, but
224  // either way, a sequence was read, so increment the sequence count.
225  ++numSequences;
226  }
227  else
228  {
229  // Other error, so break out of processing.
230  break;
231  }
232  }
233 
234  // Report Base Composition Statistics.
235  if(printBaseComp)
236  {
237  myBaseComposition.print();
238  }
239 
240  if(printQualAvg)
241  {
242  printAvgQual();
243  }
244 
245  std::string finishMessage = "Finished processing ";
246  finishMessage += myFileName.c_str();
247  char buffer[100];
248  if(sprintf(buffer,
249  " with %u lines containing %d sequences.",
250  myLineNum, numSequences) > 0)
251  {
252  finishMessage += buffer;
253  logMessage(finishMessage.c_str());
254  }
255  if(sprintf(buffer,
256  "There were a total of %d errors.",
257  myNumErrors) > 0)
258  {
259  logMessage(buffer);
260  }
261 
262  // Close the input file.
263  FastQStatus::Status closeStatus = closeFile();
264 
265  if((status != FastQStatus::FASTQ_SUCCESS) && (status != FastQStatus::FASTQ_INVALID))
266  {
267  // Stopped validating due to some error other than invalid, so
268  // return that error.
269  return(status);
270  }
271  else if(myNumErrors == 0)
272  {
273  // No errors, check to see if there were any sequences.
274  // Finished processing all of the sequences in the file.
275  // If there are no sequences, report an error.
276  if(numSequences == 0)
277  {
278  // Empty file, return error.
279  logMessage("ERROR: No FastQSequences in the file.");
281  }
283  }
284  else
285  {
286  // The file is invalid. But check the close status. If the close
287  // failed, it means there is a problem with the file itself not just
288  // with validation, so the close failure should be returned.
289  if(closeStatus != FastQStatus::FASTQ_SUCCESS)
290  {
291  return(closeStatus);
292  }
294  }
295 }
296 
297 
298 // Reads and validates a single fastq sequence from myFile.
300 {
301  // First verify that a file is open, if not, return failure.
302  if(!isOpen())
303  {
304  std::string message =
305  "ERROR: Trying to read a fastq file but no file is open.";
306  logMessage(message.c_str());
308  }
309 
310  // Reset variables for each sequence.
311  resetForEachSequence();
312 
313  bool valid = true;
314 
315  // No sequence was read.
316  if(isTimeToQuit())
317  {
319  }
320 
321  // The first line is the sequence identifier, so validate that.
322  valid = validateSequenceIdentifierLine();
323 
324  if(myFileProblem)
325  {
327  }
328 
329  // If we are at the end of the file, check to see if it is a partial
330  // sequence or just an empty line at the end.
331  if(ifeof(myFile))
332  {
333  // If the sequence identifier line was empty and we are at the
334  // end of the file, there is nothing more to validate.
335  if(mySequenceIdLine.Length() != 0)
336  {
337  // There was a sequence identifier line, so this is an incomplete
338  // sequence.
339  myErrorString = "Incomplete Sequence.\n";
340  reportErrorOnLine();
341 
342  valid = false;
343  }
344  if(valid)
345  {
346  // Return failure - no sequences were left to read. At the end
347  // of the file. It wasn't invalid and it wasn't really an error.
349  }
350  else
351  {
353  }
354  }
355 
356  // If enough errors, quit before reading any more.
357  if(isTimeToQuit())
358  {
359  // Means there was an error, so mark it as invalid.
361  }
362 
363  // Validate the Raw Sequence Line(s) and the "+" line.
364  valid &= validateRawSequenceAndPlusLines();
365 
366  if(myFileProblem)
367  {
369  }
370 
371  // If enough errors, quit before reading any more.
372  if(isTimeToQuit())
373  {
375  }
376 
377  // If it is the end of a file, it is missing the quality string.
378  if(ifeof(myFile))
379  {
380  // There was a sequence identifier line, so this is an incomplete
381  // sequence.
382  myErrorString = "Incomplete Sequence, missing Quality String.";
383  reportErrorOnLine();
384  valid = false;
386  }
387 
388  // All that is left is to validate the quality string line(s).
389  valid &= validateQualityStringLines();
390 
391  if(myFileProblem)
392  {
394  }
395 
396  if(valid)
397  {
399  }
401 }
402 
403 
404 // Reads and validates the sequence identifier line of a fastq sequence.
405 bool FastQFile::validateSequenceIdentifierLine()
406 {
407  // Read the first line of the sequence.
408  int readStatus = mySequenceIdLine.ReadLine(myFile);
409 
410  // Check to see if the read was successful.
411  if(readStatus <= 0)
412  {
413  // If EOF, not an error.
414  if(ifeof(myFile))
415  {
416  return true;
417  }
418  myFileProblem = true;
419  myErrorString = "Failure trying to read sequence identifier line";
420  reportErrorOnLine();
421  return false;
422  }
423 
424  // If the line is 0 length and it is the end of the file, just
425  // return since this is the eof - no error.
426  if((mySequenceIdLine.Length() == 0) && (ifeof(myFile)))
427  {
428  // Not an error, just a new line at the end of the file.
429  return true;
430  }
431 
432  // Increment the line number.
433  myLineNum++;
434 
435  // Verify that the line has at least 2 characters: '@' and at least
436  // one character for the sequence identifier.
437  if(mySequenceIdLine.Length() < 2)
438  {
439  // Error. Sequence Identifier line not long enough.
440  myErrorString = "The sequence identifier line was too short.";
441  reportErrorOnLine();
442  return false;
443  }
444 
445  // The sequence identifier line must start wtih a '@'
446  if(mySequenceIdLine[0] != '@')
447  {
448  // Error - sequence identifier line does not begin with an '@'.
449  myErrorString = "First line of a sequence does not begin with @";
450  reportErrorOnLine();
451  return false;
452  }
453 
454  // Valid Sequence Identifier Line.
455 
456  // The sequence identifier ends at the first space or at the end of the
457  // line if there is no space.
458  // Use fast find since this is a case insensitive search.
459  // Start at 1 since we know that 0 is '@'
460  int endSequenceIdentifier = mySequenceIdLine.FastFindChar(' ', 1);
461 
462  // Check if a " " was found.
463  if(endSequenceIdentifier == -1)
464  {
465  // Did not find a ' ', so the identifier is the rest of the line.
466  // It starts at 1 since @ is at offset 0.
467  mySequenceIdentifier = (mySequenceIdLine.SubStr(1)).c_str();
468  }
469  else
470  {
471  // Found a ' ', so the identifier ends just before that.
472  // The sequence identifier must be at least 1 character long,
473  // therefore the endSequenceIdentifier must be greater than 1.
474  if(endSequenceIdentifier <= 1)
475  {
476  myErrorString =
477  "No Sequence Identifier specified before the comment.";
478  reportErrorOnLine();
479  return false;
480  }
481 
482  mySequenceIdentifier =
483  (mySequenceIdLine.SubStr(1, endSequenceIdentifier - 1)).c_str();
484  }
485 
486  // Check if sequence identifier should be validated for uniqueness.
487  if(myCheckSeqID)
488  {
489  // Check to see if the sequenceIdentifier is a repeat by adding
490  // it to the set and seeing if it already existed.
491  std::pair<std::map<std::string, unsigned int>::iterator,bool> insertResult;
492  insertResult =
493  myIdentifierMap.insert(std::make_pair(mySequenceIdentifier.c_str(),
494  myLineNum));
495 
496  if(insertResult.second == false)
497  {
498  // Sequence Identifier is a repeat.
499  myErrorString = "Repeated Sequence Identifier: ";
500  myErrorString += mySequenceIdentifier.c_str();
501  myErrorString += " at Lines ";
502  myErrorString += insertResult.first->second;
503  myErrorString += " and ";
504  myErrorString += myLineNum;
505  reportErrorOnLine();
506  return(false);
507  }
508  }
509 
510  // Valid, return true.
511  return(true);
512 }
513 
514 
515 // Reads and validates the raw sequence line(s) and the plus line. Both are
516 // included in one method since it is unknown when the raw sequence line
517 // ends until you find the plus line that divides it from the quality
518 // string. Since this method will read the plus line to know when the
519 // raw sequence ends, it also validates that line.
520 bool FastQFile::validateRawSequenceAndPlusLines()
521 {
522  // Read the raw sequence.
523  int readStatus = myRawSequence.ReadLine(myFile);
524 
525  myLineNum++;
526 
527  if(readStatus <= 0)
528  {
529  myFileProblem = true;
530  myErrorString = "Failure trying to read sequence line";
531  reportErrorOnLine();
532  return false;
533  }
534 
535  // Offset into the raw sequence to be validated.
536  int offset = 0;
537 
538  // Validate the raw sequence.
539  bool valid = validateRawSequence(offset);
540 
541  // Increment the offset for what was just read.
542  offset = myRawSequence.Length();
543 
544  // The next line is either a continuation of the raw sequence or it starts
545  // with a '+'
546  // Keep validating til the '+' line or the end of file is found.
547  bool stillRawLine = true;
548  while(stillRawLine &&
549  !ifeof(myFile))
550  {
551  // If enough errors, quit before reading any more.
552  if(isTimeToQuit())
553  {
554  return(false);
555  }
556 
557  // Read the next line.
558  // Read into the plus line, but if it isn't a plus line, then
559  // it will be copied into the raw sequence line.
560  readStatus = myPlusLine.ReadLine(myFile);
561  myLineNum++;
562 
563  if(readStatus <= 0)
564  {
565  myFileProblem = true;
566  myErrorString = "Failure trying to read sequence/plus line";
567  reportErrorOnLine();
568  return false;
569  }
570 
571  // Check if the next line is blank
572  if(myPlusLine.Length() == 0)
573  {
574  // The next line is blank. Assume it is part of the raw sequence and
575  // report an error since there are no valid characters on the line.
576  myErrorString =
577  "Looking for continuation of Raw Sequence or '+' instead found a blank line, assuming it was part of Raw Sequence.";
578  reportErrorOnLine();
579  }
580  // Check for the plus line.
581  else if(myPlusLine[0] == '+')
582  {
583  // This is the + line.
584  valid &= validateSequencePlus();
585  stillRawLine = false;
586  }
587  else
588  {
589  // Not a plus line, so assume this is a continuation of the Raw
590  // Sequence.
591  // Copy from the plus line to the raw sequence line.
592  myRawSequence += myPlusLine;
593  myPlusLine.SetLength(0);
594  valid &= validateRawSequence(offset);
595 
596  // Increment the offset.
597  offset = myRawSequence.Length();
598  }
599  }
600 
601  // If enough errors, quit before reading any more.
602  if(isTimeToQuit())
603  {
604  return(false);
605  }
606 
607  // Now that the entire raw sequence has been obtained, check its length
608  // against the minimum allowed length.
609  if(myRawSequence.Length() < myMinReadLength)
610  {
611  // Raw sequence is not long enough - error.
612  myErrorString = "Raw Sequence is shorter than the min read length: ";
613  myErrorString += myRawSequence.Length();
614  myErrorString += " < ";
615  myErrorString += myMinReadLength;
616  reportErrorOnLine();
617  valid = false;
618  }
619 
620  // If enough errors, quit before reading any more.
621  if(isTimeToQuit())
622  {
623  return(false);
624  }
625 
626  // if the flag still indicates it is processing the raw sequence that means
627  // we reached the end of the file without a '+' line. So report that error.
628  if(stillRawLine)
629  {
630  myErrorString = "Reached the end of the file without a '+' line.";
631  reportErrorOnLine();
632  valid = false;
633  }
634 
635  return(valid);
636 }
637 
638 
639 // Reads and validates the quality string line(s).
640 bool FastQFile::validateQualityStringLines()
641 {
642  // Read the quality string.
643  int readStatus = myQualityString.ReadLine(myFile);
644  myLineNum++;
645 
646  if(readStatus <= 0)
647  {
648  myFileProblem = true;
649  myErrorString = "Failure trying to read quality line";
650  reportErrorOnLine();
651  return false;
652  }
653 
654  // track the offset into the quality string to validate.
655  int offset = 0;
656 
657  // Validate this line of the quality string.
658  bool valid = validateQualityString(offset);
659 
660  offset = myQualityString.Length();
661 
662  // Keep reading quality string lines until the length of the
663  // raw sequence has been hit or the end of the file is reached.
664  while((myQualityString.Length() < myRawSequence.Length()) &&
665  (!ifeof(myFile)))
666  {
667  // If enough errors, quit before reading any more.
668  if(isTimeToQuit())
669  {
670  return(false);
671  }
672 
673  // Read another line of the quality string.
674  readStatus = myTempPartialQuality.ReadLine(myFile);
675  myLineNum++;
676 
677  if(readStatus <= 0)
678  {
679  myFileProblem = true;
680  myErrorString = "Failure trying to read quality line";
681  reportErrorOnLine();
682  return false;
683  }
684 
685  myQualityString += myTempPartialQuality;
686  myTempPartialQuality.Clear();
687 
688  // Validate this line of the quality string.
689  valid &= validateQualityString(offset);
690  offset = myQualityString.Length();
691  }
692 
693  // If enough errors, quit before reading any more.
694  if(isTimeToQuit())
695  {
696  return(false);
697  }
698 
699  // Validate that the quality string length is the same as the
700  // raw sequence length.
701  if(myQualityString.Length() != myRawSequence.Length())
702  {
703  myErrorString = "Quality string length (";
704  myErrorString += myQualityString.Length();
705  myErrorString += ") does not equal raw sequence length (";
706  myErrorString += myRawSequence.Length();
707  myErrorString += ")";
708  reportErrorOnLine();
709  valid = false;
710  }
711  return(valid);
712 }
713 
714 
715 // Method to validate a line that contains part of the raw sequence.
716 bool FastQFile::validateRawSequence(int offset)
717 {
718  bool validBase = false;
719  bool valid = true;
720 
721  // Loop through validating each character is valid for the raw sequence.
722  for(int sequenceIndex = offset; sequenceIndex < myRawSequence.Length();
723  sequenceIndex++)
724  {
725  // Update the composition for this position. Returns false if the
726  // character was not a valid base.
727  validBase =
728  myBaseComposition.updateComposition(sequenceIndex,
729  myRawSequence[sequenceIndex]);
730  // Check the return
731  if(!validBase)
732  {
733  // Error, found a value that is not a valid base character.
734  myErrorString = "Invalid character ('";
735  myErrorString += myRawSequence[sequenceIndex];
736  myErrorString += "') in base sequence.";
737  reportErrorOnLine();
738  valid = false;
739  // If enough errors, quit before reading any more.
740  if(isTimeToQuit())
741  {
742  return(false);
743  }
744  }
745  }
746  return(valid);
747 }
748 
749 
750 // Method to validate the "+" line that seperates the raw sequence and the
751 // quality string.
752 bool FastQFile::validateSequencePlus()
753 {
754  // Validate that optional sequence identifier is the same
755  // as the one on the @ line.
756 
757  // Check to see if there is more to the line than just the plus
758  int lineLength = myPlusLine.Length();
759 
760  // If the line is only 1 character or the second character is a space,
761  // then there is no sequence identifier on this line and there is nothing
762  // further to validate.
763  if((lineLength == 1) || (myPlusLine[1] == ' '))
764  {
765  // No sequence identifier, so just return valid.
766  return true;
767  }
768 
769  // There is a sequence identifier on this line, so validate that
770  // it matches the one from the associated @ line.
771  // The read in line must be at least 1 more character ('+') than the
772  // sequence identifier read from the '@' line.
773  // If it is not longer than the sequence identifier, then we know that it
774  // cannot be the same.
775  int sequenceIdentifierLength = mySequenceIdentifier.Length();
776  if(lineLength <= sequenceIdentifierLength)
777  {
778  myErrorString =
779  "Sequence Identifier on '+' line does not equal the one on the '@' line.";
780  reportErrorOnLine();
781  return false;
782  }
783 
784  bool same = true;
785  int seqIndex = 0;
786  int lineIndex = 1; // Start at 1 since offset 0 has '+'
787 
788  // Loop through the sequence index and the line buffer verifying they
789  // are the same until a difference is found or the end of the sequence
790  // identifier is found.
791  while((same == true) && (seqIndex < sequenceIdentifierLength))
792  {
793  if(myPlusLine[lineIndex] != mySequenceIdentifier[seqIndex])
794  {
795  myErrorString =
796  "Sequence Identifier on '+' line does not equal the one on the '@' line.";
797  reportErrorOnLine();
798  same = false;
799  }
800  lineIndex++;
801  seqIndex++;
802  }
803  return(same);
804 }
805 
806 
807 // Method to validate the quality string.
808 bool FastQFile::validateQualityString(int offset)
809 {
810  bool valid = true;
811  if(myQualityString.Length() > (int)(myQualPerCycle.size()))
812  {
813  myQualPerCycle.resize(myQualityString.Length());
814  myCountPerCycle.resize(myQualityString.Length());
815  }
816  // For each character in the line, verify that it is ascii > 32.
817  for(int i=offset; i < myQualityString.Length(); i++)
818  {
819  if(myQualityString[i] <= 32)
820  {
821  myErrorString = "Invalid character ('";
822  myErrorString += myQualityString[i];
823  myErrorString += "') in quality string.";
824  reportErrorOnLine();
825  valid = false;
826  // If enough errors, quit before reading any more.
827  if(isTimeToQuit())
828  {
829  return(false);
830  }
831  }
832  else
833  {
834  myQualPerCycle[i] += BaseUtilities::getPhredBaseQuality(myQualityString[i]);
835  myCountPerCycle[i] += 1;
836  }
837  }
838  return(valid);
839 }
840 
841 
842 // Helper method for printing the contents of myErrorString. It will
843 // only print the errors until the maximum number of reportable errors is
844 // reached.
845 void FastQFile::reportErrorOnLine()
846 {
847  // Increment the total number of errors.
848  myNumErrors++;
849 
850  // Only display the first X number of errors.
851  if(myNumErrors <= myNumPrintableErrors)
852  {
853  // Write the error with the line number.
854  char buffer[100];
855  sprintf(buffer, "ERROR on Line %u: ", myLineNum);
856  std::string message = buffer;
857  message += myErrorString.c_str();
858  logMessage(message.c_str());
859  }
860 }
861 
862 
863 // Reset member data that is unique for each fastQFile.
864 void FastQFile::reset()
865 {
866  // Each fastq file processing needs to also reset the member data for each
867  // sequence.
868  resetForEachSequence();
869  myNumErrors = 0; // per fastqfile
870  myLineNum = 0; // per fastqfile
871  myFileName.SetLength(0); // reset the filename string.
872  myIdentifierMap.clear(); // per fastqfile
873  myBaseComposition.clear(); // clear the base composition.
874  myQualPerCycle.clear();
875  myCountPerCycle.clear();
876  myFileProblem = false;
877 }
878 
879 
880 // Reset the member data that is unique for each sequence.
881 void FastQFile::resetForEachSequence()
882 {
883  myLineBuffer.SetLength(0);
884  myErrorString.SetLength(0);
885  myRawSequence.SetLength(0);
886  mySequenceIdLine.SetLength(0);
887  mySequenceIdentifier.SetLength(0);
888  myPlusLine.SetLength(0);
889  myQualityString.SetLength(0);
890  myTempPartialQuality.SetLength(0);
891 }
892 
893 
894 void FastQFile::logMessage(const char* logMessage)
895 {
896  // Write the message if they are not disabled.
897  if(!myDisableMessages)
898  {
899  std::cout << logMessage << std::endl;
900  }
901 }
902 
903 
904 // Determine if it is time to quit by checking if we are to quit after a
905 // certain number of errors and that many errors have been encountered.
906 bool FastQFile::isTimeToQuit()
907 {
908  // It is time to quit if we are to quit after a certain number of errors
909  // and that many errors have been encountered.
910  if((myMaxErrors != -1) && (myNumErrors >= myMaxErrors))
911  {
912  return(true);
913  }
914  return(false);
915 }
916 
917 
918 void FastQFile::printAvgQual()
919 {
920  std::cout << std::endl << "Average Phred Quality by Read Index (starts at 0):" << std::endl;
921  std::cout.precision(2);
922  std::cout << std::fixed << "Read Index\tAverage Quality"
923  << std::endl;
924  if(myQualPerCycle.size() != myCountPerCycle.size())
925  {
926  // This is a code error and should NEVER happen.
927  std::cerr << "ERROR calculating the average Qualities per cycle\n";
928  }
929 
930  double sumQual = 0;
931  double count = 0;
932  double avgQual = 0;
933  for(unsigned int i = 0; i < myQualPerCycle.size(); i++)
934  {
935  avgQual = 0;
936  if(myCountPerCycle[i] != 0)
937  {
938  avgQual = myQualPerCycle[i] / (double)(myCountPerCycle[i]);
939  }
940  std::cout << i << "\t" << avgQual << "\n";
941  sumQual += myQualPerCycle[i];
942  count += myCountPerCycle[i];
943  }
944  std::cout << std::endl;
945  avgQual = 0;
946  if(count != 0)
947  {
948  avgQual = sumQual / count;
949  }
950  std::cout << "Overall Average Phred Quality = " << avgQual << std::endl;
951 }
FastQFile::openFile
FastQStatus::Status openFile(const char *fileName, BaseAsciiMap::SPACE_TYPE spaceType=BaseAsciiMap::UNKNOWN)
Open a FastQFile.
Definition: FastQFile.cpp:83
FastQFile::isOpen
bool isOpen()
Check to see if the file is open.
Definition: FastQFile.cpp:153
FastQFile::FastQFile
FastQFile(int minReadLength=10, int numPrintableErrors=20)
Constructor.
Definition: FastQFile.cpp:30
FastQFile::disableSeqIDCheck
void disableSeqIDCheck()
Disable Unique Sequence ID checking (Unique Sequence ID checking is enabled by default).
Definition: FastQFile.cpp:61
BaseComposition::setBaseMapType
void setBaseMapType(BaseAsciiMap::SPACE_TYPE spaceType)
Set the base map type for this composition.
Definition: BaseComposition.h:52
BaseComposition::resetBaseMapType
void resetBaseMapType()
Reset the base map type for this composition.
Definition: BaseComposition.h:46
String
Definition: StringBasics.h:39
FastQFile::enableMessages
void enableMessages()
Enable messages - write to cout.
Definition: FastQFile.cpp:53
BaseUtilities::getPhredBaseQuality
static uint8_t getPhredBaseQuality(char charQuality)
Get phred base quality from the specified ascii quality.
Definition: BaseUtilities.cpp:65
FastQStatus::FASTQ_CLOSE_ERROR
@ FASTQ_CLOSE_ERROR
means the file could not be closed.
Definition: FastQStatus.h:36
ifopen
IFILE ifopen(const char *filename, const char *mode, InputFile::ifileCompression compressionMode=InputFile::DEFAULT)
Open a file with the specified name and mode, using a filename of "-" to indicate stdin/stdout.
Definition: InputFile.h:562
BaseComposition::clear
void clear()
Clear the composition stored in the base count vector.
Definition: BaseComposition.cpp:89
FastQFile::disableMessages
void disableMessages()
Disable messages - do not write to cout.
Definition: FastQFile.cpp:47
FastQStatus::FASTQ_OPEN_ERROR
@ FASTQ_OPEN_ERROR
means the file could not be opened.
Definition: FastQStatus.h:35
InputFile::isOpen
bool isOpen() const
Returns whether or not the file was successfully opened.
Definition: InputFile.h:423
BaseAsciiMap::SPACE_TYPE
SPACE_TYPE
The type of space (color or base) to use in the mapping.
Definition: BaseAsciiMap.h:44
ifeof
int ifeof(IFILE file)
Check to see if we have reached the EOF (returns 0 if not EOF).
Definition: InputFile.h:654
FastQFile::closeFile
FastQStatus::Status closeFile()
Close a FastQFile.
Definition: FastQFile.cpp:125
FastQStatus::FASTQ_ORDER_ERROR
@ FASTQ_ORDER_ERROR
means the methods are called out of order, like trying to read a file before opening it.
Definition: FastQStatus.h:34
FastQStatus::FASTQ_READ_ERROR
@ FASTQ_READ_ERROR
means that a problem occurred on a read.
Definition: FastQStatus.h:37
FastQFile::keepReadingFile
bool keepReadingFile()
Returns whether or not to keep reading the file, it stops reading (false) if eof or there is a proble...
Definition: FastQFile.cpp:184
InputFile.h
FastQStatus::FASTQ_INVALID
@ FASTQ_INVALID
means that the sequence was invalid.
Definition: FastQStatus.h:33
BaseComposition::updateComposition
bool updateComposition(unsigned int rawSequenceCharIndex, char baseChar)
Update the composition for the specified index with the specified character.
Definition: BaseComposition.cpp:32
FastQFile::setMaxErrors
void setMaxErrors(int maxErrors)
Set the number of errors after which to quit reading/validating a file, defaults to -1.
Definition: FastQFile.cpp:76
BaseComposition::print
void print()
Print the composition.
Definition: BaseComposition.cpp:70
FastQStatus::FASTQ_SUCCESS
@ FASTQ_SUCCESS
indicates method finished successfully.
Definition: FastQStatus.h:32
ifclose
int ifclose(IFILE &file)
Close the file.
Definition: InputFile.h:580
FastQFile::readFastQSequence
FastQStatus::Status readFastQSequence()
Read 1 FastQSequence, validating it.
Definition: FastQFile.cpp:299
FastQFile::validateFastQFile
FastQStatus::Status validateFastQFile(const String &filename, bool printBaseComp, BaseAsciiMap::SPACE_TYPE spaceType, bool printQualAvg=false)
Validate the specified fastq file.
Definition: FastQFile.cpp:195
FastQStatus::FASTQ_NO_SEQUENCE_ERROR
@ FASTQ_NO_SEQUENCE_ERROR
means there were no errors, but no sequences read.
Definition: FastQStatus.h:38
FastQFile::enableSeqIDCheck
void enableSeqIDCheck()
Enable Unique Sequence ID checking.
Definition: FastQFile.cpp:69
FastQStatus::Status
Status
Return value enum for the FastQFile class methods, indicating success or error codes.
Definition: FastQStatus.h:31
FastQFile::isEof
bool isEof()
Check to see if the file is at the end of the file.
Definition: FastQFile.cpp:168