libStatGen Software  1
FastQFile.h
1 /*
2  * Copyright (C) 2010 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #ifndef __FASTQ_VALIDATOR_H__
19 #define __FASTQ_VALIDATOR_H__
20 
21 #include <iostream>
22 #include <map>
23 #include "StringBasics.h"
24 #include "InputFile.h"
25 #include "BaseComposition.h"
26 #include "FastQStatus.h"
27 
28 /// Class for reading/validating a fastq file.
29 class FastQFile
30 {
31  public:
32  /// Constructor.
33  /// /param minReadLength The minimum length that a base sequence must be for
34  /// it to be valid.
35  /// \param numPrintableErrors The maximum number of errors that should be reported
36  /// in detail before suppressing the errors.
37  FastQFile(int minReadLength = 10, int numPrintableErrors = 20);
38 
39  /// Disable messages - do not write to cout.
40  void disableMessages();
41 
42  /// Enable messages - write to cout.
43  void enableMessages();
44 
45  /// Disable Unique Sequence ID checking
46  /// (Unique Sequence ID checking is enabled by default).
47  void disableSeqIDCheck();
48 
49  /// Enable Unique Sequence ID checking.
50  /// (Unique Sequence ID checking is enabled by default).
51  void enableSeqIDCheck();
52 
53  /// Interleaved.
54  void interleaved();
55 
56  /// Set the number of errors after which to quit reading/validating a file,
57  /// defaults to -1.
58  /// \param maxErrors # of errors before quitting,
59  /// -1 indicates to not quit until the entire file has been read/validated (default),
60  /// 0 indicates to quit without reading/validating anything.
61  void setMaxErrors(int maxErrors);
62 
63  /// Open a FastQFile.
64  /// Use the specified SPACE_TYPE to determine BASE, COLOR, or UNKNOWN.
65  FastQStatus::Status openFile(const char* fileName,
67 
68  /// Close a FastQFile.
70 
71  /// Check to see if the file is open.
72  bool isOpen();
73 
74  /// Check to see if the file is at the end of the file.
75  bool isEof();
76 
77  /// Returns whether or not to keep reading the file,
78  /// it stops reading (false) if eof or there is a problem reading the file.
79  bool keepReadingFile();
80 
81  /// Validate the specified fastq file
82  /// \param filename fastq file to be validated.
83  /// \param printBaseComp whether or not to print the base composition for the file.
84  /// true means print it, false means do not.
85  /// \param spaceType the spaceType to use for validation - BASE_SPACE, COLOR_SPACE,
86  /// or UNKNOWN (UNKNOWN means to determine the spaceType to
87  /// validate against from the first character of the first
88  /// sequence).
89  /// \param printQualAvg whether or not to print the quality averages for the file.
90  /// true means to print it, false (default) means do not.
91  /// \return the fastq validation status, SUCCESS on a successfully
92  /// validated fastq file.
94  bool printBaseComp,
95  BaseAsciiMap::SPACE_TYPE spaceType,
96  bool printQualAvg = false);
97 
98  /// Read 1 FastQSequence, validating it.
100 
101  ///////////////////////
102  /// @name Public Sequence Line variables.
103  /// Keep public variables for a sequence's line so they can be accessed
104  /// without having to do string copies.
105  //@{
106  String myRawSequence;
107  String mySequenceIdLine;
108  String mySequenceIdentifier;
109  String myPlusLine;
110  String myQualityString;
111  //@}
112 
113  /// Get the space type used for this file.
115  {
116  return(myBaseComposition.getSpaceType());
117  }
118 
119 private:
120  // Validates a single fastq sequence from myFile.
121  bool validateFastQSequence();
122 
123  // Reads and validates the sequence identifier line of a fastq sequence.
124  bool validateSequenceIdentifierLine();
125 
126  // Reads and validates the raw sequence line(s) and the plus line. Both are
127  // included in one method since it is unknown when the raw sequence line
128  // ends until you find the plus line that divides it from the quality
129  // string. Since this method will read the plus line to know when the
130  // raw sequence ends, it also validates that line.
131  bool validateRawSequenceAndPlusLines();
132 
133  // Reads and validates the quality string line(s).
134  bool validateQualityStringLines();
135 
136  // Method to validate a line that contains part of the raw sequence.
137  // offset specifies where in the sequence to start validating.
138  bool validateRawSequence(int offset);
139 
140  // Method to validate the "+" line that seperates the raw sequence and the
141  // quality string.
142  bool validateSequencePlus();
143 
144  // Method to validate the quality string.
145  // offset specifies where in the quality string to start validating.
146  bool validateQualityString(int offset);
147 
148  // Helper method to read a line from the input file into a string.
149  // It also tracks the line number.
150  void readLine();
151 
152  // Helper method for printing the contents of myErrorString. It will
153  // only print the errors until the maximum number of reportable errors is
154  // reached.
155  void reportErrorOnLine();
156 
157  // Reset the member data for each fastq file.
158  void reset();
159 
160  // Reset the member data for each sequence.
161  void resetForEachSequence();
162 
163  // Log the specified message if enabled.
164  void logMessage(const char* message);
165 
166  // Determine if it is time to quit by checking if we are to quit after a
167  // certain number of errors and that many errors have been encountered.
168  bool isTimeToQuit();
169 
170  void printAvgQual();
171 
172  //////////////////////////////////////////////////////////////////////
173  // Following member data elements are reset for each validated sequence.
174  //
175 
176  // Buffer for storing the contents of the line read.
177  // Stored as member data so memory allocation is only done once.
178  String myLineBuffer;
179 
180  // Buffer for storing the error string. This prevents the reallocation of
181  // the string buffer for each error.
182  String myErrorString;
183 
184  String myTempPartialQuality;
185 
186  //////////////////////////////////////////////////////////////////////
187  // Following member data elements are reset for each validated file.
188  //
189  IFILE myFile; // Input file to be read.
190  String myFileName; // Name of file being processed.
191  int myNumErrors; // Tracks the number of errors.
192  unsigned int myLineNum; // Track the line number - used for reporting errors.
193  BaseComposition myBaseComposition; // Tracks the base composition.
194  std::vector<int> myQualPerCycle; // Tracks the quality by cycle.
195  std::vector<int> myCountPerCycle; // Tracks the number of entries by cycle.
196 
197  // Whether or not to check the sequence identifier for uniqueness.
198  // Checking may use up a lot of memory.
199  bool myCheckSeqID;
200 
201  // Whether or not to check that the file is interleaved.
202  // Disabled by myCheckSeqID
203  bool myInterleaved;
204 
205  // Previous sequence id for checking interleaved.
206  std::string myPrevSeqID;
207 
208  // Map to track which identifiers have appeared in the file.
209  std::map<std::string, unsigned int> myIdentifierMap;
210 
211  //////////////////////////////////////////////////////////////////////
212  // Following member data do not change for each call to the validator.
213  //
214  int myMinReadLength; // Min Length for a read.
215  int myNumPrintableErrors; // Max number of errors to print the details of.
216 
217  // Number of errors after which to quit reading/validating a file.
218  // Defaults to -1.
219  // -1 indicates to not quit until the entire file has been read/validated.
220  // 0 indicates to quit without reading/validating anything.
221  int myMaxErrors;
222 
223  // Whether or not messages should be printed.
224  // Defaulted to false (they should be printed).
225  bool myDisableMessages;
226 
227  // Track if there is a problem reading the file. If there are read
228  // problems, stop reading the file.
229  bool myFileProblem;
230 };
231 
232 #endif
SPACE_TYPE
The type of space (color or base) to use in the mapping.
Definition: BaseAsciiMap.h:44
@ UNKNOWN
Base decision on the first raw seq character/type has yet to be determined.
Definition: BaseAsciiMap.h:47
Class that tracks the composition of base by read location.
BaseAsciiMap::SPACE_TYPE getSpaceType()
Get the space type for this composition.
Class for reading/validating a fastq file.
Definition: FastQFile.h:30
void interleaved()
Interleaved.
Definition: FastQFile.cpp:78
FastQStatus::Status openFile(const char *fileName, BaseAsciiMap::SPACE_TYPE spaceType=BaseAsciiMap::UNKNOWN)
Open a FastQFile.
Definition: FastQFile.cpp:92
void enableSeqIDCheck()
Enable Unique Sequence ID checking.
Definition: FastQFile.cpp:71
void disableMessages()
Disable messages - do not write to cout.
Definition: FastQFile.cpp:49
bool isOpen()
Check to see if the file is open.
Definition: FastQFile.cpp:162
void disableSeqIDCheck()
Disable Unique Sequence ID checking (Unique Sequence ID checking is enabled by default).
Definition: FastQFile.cpp:63
BaseAsciiMap::SPACE_TYPE getSpaceType()
Get the space type used for this file.
Definition: FastQFile.h:114
FastQStatus::Status readFastQSequence()
Read 1 FastQSequence, validating it.
Definition: FastQFile.cpp:309
FastQStatus::Status closeFile()
Close a FastQFile.
Definition: FastQFile.cpp:134
FastQStatus::Status validateFastQFile(const String &filename, bool printBaseComp, BaseAsciiMap::SPACE_TYPE spaceType, bool printQualAvg=false)
Validate the specified fastq file.
Definition: FastQFile.cpp:204
bool keepReadingFile()
Returns whether or not to keep reading the file, it stops reading (false) if eof or there is a problem reading the file.
Definition: FastQFile.cpp:193
void enableMessages()
Enable messages - write to cout.
Definition: FastQFile.cpp:55
void setMaxErrors(int maxErrors)
Set the number of errors after which to quit reading/validating a file, defaults to -1.
Definition: FastQFile.cpp:85
FastQFile(int minReadLength=10, int numPrintableErrors=20)
Constructor.
Definition: FastQFile.cpp:30
bool isEof()
Check to see if the file is at the end of the file.
Definition: FastQFile.cpp:177
Status
Return value enum for the FastQFile class methods, indicating success or error codes.
Definition: FastQStatus.h:31
Class for easily reading/writing files without having to worry about file type (uncompressed,...
Definition: InputFile.h:37