libStatGen Software 1
FastQFile Class Reference

Class for reading/validating a fastq file. More...

#include <FastQFile.h>

Collaboration diagram for FastQFile:

Public Member Functions

 FastQFile (int minReadLength=10, int numPrintableErrors=20)
 Constructor. More...
 
void disableMessages ()
 Disable messages - do not write to cout. More...
 
void enableMessages ()
 Enable messages - write to cout. More...
 
void disableSeqIDCheck ()
 Disable Unique Sequence ID checking (Unique Sequence ID checking is enabled by default). More...
 
void enableSeqIDCheck ()
 Enable Unique Sequence ID checking. More...
 
void interleaved ()
 Mark the file as interleaved (sets the internal interleaved flag). More...
 
void setMaxErrors (int maxErrors)
 Set the number of errors after which to quit reading/validating a file, defaults to -1. More...
 
FastQStatus::Status openFile (const char *fileName, BaseAsciiMap::SPACE_TYPE spaceType=BaseAsciiMap::UNKNOWN)
 Open a FastQFile. More...
 
FastQStatus::Status closeFile ()
 Close a FastQFile. More...
 
bool isOpen ()
 Check to see if the file is open. More...
 
bool isEof ()
 Check to see if the file is at the end of the file. More...
 
bool keepReadingFile ()
 Returns whether or not to keep reading the file; returns false (stop reading) if the file is at EOF or there was a problem reading the file. More...
 
FastQStatus::Status validateFastQFile (const String &filename, bool printBaseComp, BaseAsciiMap::SPACE_TYPE spaceType, bool printQualAvg=false)
 Validate the specified fastq file. More...
 
FastQStatus::Status readFastQSequence ()
 Read 1 FastQSequence, validating it. More...
 

Public Sequence Line variables.

Keep public variables for a sequence's line so they can be accessed without having to do string copies.

String myRawSequence
 
String mySequenceIdLine
 
String mySequenceIdentifier
 
String myPlusLine
 
String myQualityString
 
BaseAsciiMap::SPACE_TYPE getSpaceType ()
 Get the space type used for this file. More...
 

Detailed Description

Class for reading/validating a fastq file.

Definition at line 29 of file FastQFile.h.

Constructor & Destructor Documentation

◆ FastQFile()

FastQFile::FastQFile ( int  minReadLength = 10,
int  numPrintableErrors = 20 
)

Constructor.

minReadLength: The minimum length that a base sequence must be for it to be valid.

Parameters
numPrintableErrors: The maximum number of errors that should be reported in detail before suppressing the errors.

Definition at line 30 of file FastQFile.cpp.

31 : myFile(NULL),
32 myBaseComposition(),
33 myQualPerCycle(),
34 myCountPerCycle(),
35 myCheckSeqID(true),
36 myInterleaved(false),
37 myPrevSeqID(""),
38 myMinReadLength(minReadLength),
39 myNumPrintableErrors(numPrintableErrors),
40 myMaxErrors(-1),
41 myDisableMessages(false),
42 myFileProblem(false)
43{
44 // Reset the member data.
45 reset();
46}

Member Function Documentation

◆ closeFile()

FastQStatus::Status FastQFile::closeFile ( )

Close a FastQFile.

Definition at line 134 of file FastQFile.cpp.

135{
136 int closeStatus = 0; // Success.
137
138 // If a file has been opened, close it.
139 if(myFile != NULL)
140 {
141 // Close the file.
142 closeStatus = ifclose(myFile);
143 myFile = NULL;
144 }
145 if(closeStatus == 0)
146 {
147 // Success - either there wasn't a file to close or it was closed
148 // successfully.
149 return(FastQStatus::FASTQ_SUCCESS);
150 }
151 else
152 {
153 std::string errorMessage = "Failed to close file: ";
154 errorMessage += myFileName.c_str();
155 logMessage(errorMessage.c_str());
156 return(FastQStatus::FASTQ_CLOSE_ERROR);
157 }
158}
int ifclose(IFILE &file)
Close the file.
Definition: InputFile.h:580
@ FASTQ_SUCCESS
indicates method finished successfully.
Definition: FastQStatus.h:32
@ FASTQ_CLOSE_ERROR
means the file could not be closed.
Definition: FastQStatus.h:36

References FastQStatus::FASTQ_CLOSE_ERROR, FastQStatus::FASTQ_SUCCESS, and ifclose().

Referenced by openFile(), and validateFastQFile().

◆ disableMessages()

void FastQFile::disableMessages ( )

Disable messages - do not write to cout.

Definition at line 49 of file FastQFile.cpp.

50{
51 myDisableMessages = true;
52}

◆ disableSeqIDCheck()

void FastQFile::disableSeqIDCheck ( )

Disable Unique Sequence ID checking (Unique Sequence ID checking is enabled by default).

Definition at line 63 of file FastQFile.cpp.

64{
65 myCheckSeqID = false;
66}

◆ enableMessages()

void FastQFile::enableMessages ( )

Enable messages - write to cout.

Definition at line 55 of file FastQFile.cpp.

56{
57 myDisableMessages = false;
58}

◆ enableSeqIDCheck()

void FastQFile::enableSeqIDCheck ( )

Enable Unique Sequence ID checking.

(Unique Sequence ID checking is enabled by default).

Definition at line 71 of file FastQFile.cpp.

72{
73 myCheckSeqID = true;
74}

◆ getSpaceType()

BaseAsciiMap::SPACE_TYPE FastQFile::getSpaceType ( )
inline

Get the space type used for this file.

Definition at line 114 of file FastQFile.h.

115 {
116 return(myBaseComposition.getSpaceType());
117 }
BaseAsciiMap::SPACE_TYPE getSpaceType()
Get the space type for this composition.

References BaseComposition::getSpaceType().

◆ interleaved()

void FastQFile::interleaved ( )

Mark the file as interleaved: sets the internal myInterleaved flag to true.

Definition at line 78 of file FastQFile.cpp.

79{
80 myInterleaved = true;
81}

◆ isEof()

bool FastQFile::isEof ( )

Check to see if the file is at the end of the file.

Definition at line 177 of file FastQFile.cpp.

178{
179 // Check to see if the file is open.
180 if((myFile != NULL) && (ifeof(myFile)))
181 {
182 // At EOF.
183 return true;
184 }
185
186 // Not at EOF.
187 return false;
188}
int ifeof(IFILE file)
Check to see if we have reached the EOF (returns 0 if not EOF).
Definition: InputFile.h:654

References ifeof().

Referenced by keepReadingFile().

◆ isOpen()

bool FastQFile::isOpen ( )

Check to see if the file is open.

Definition at line 162 of file FastQFile.cpp.

163{
164 // Check to see if the file is open.
165 if((myFile != NULL) && (myFile->isOpen()))
166 {
167 // File pointer exists and the file is open.
168 return true;
169 }
170
171 // File is not open.
172 return false;
173}
bool isOpen() const
Returns whether or not the file was successfully opened.
Definition: InputFile.h:423

References InputFile::isOpen().

Referenced by readFastQSequence().

◆ keepReadingFile()

bool FastQFile::keepReadingFile ( )

Returns whether or not to keep reading the file; returns false (stop reading) if the file is at EOF or there was a problem reading the file.

Definition at line 193 of file FastQFile.cpp.

194{
195 if(isEof() || myFileProblem)
196 {
197 return(false);
198 }
199 return(true);
200}
bool isEof()
Check to see if the file is at the end of the file.
Definition: FastQFile.cpp:177

References isEof().

Referenced by validateFastQFile().

◆ openFile()

FastQStatus::Status FastQFile::openFile ( const char *  fileName,
BaseAsciiMap::SPACE_TYPE  spaceType = BaseAsciiMap::UNKNOWN 
)

Open a FastQFile.

Use the specified SPACE_TYPE to determine BASE, COLOR, or UNKNOWN.

Definition at line 92 of file FastQFile.cpp.

94{
95 // reset the member data.
96 reset();
97
98 myBaseComposition.resetBaseMapType();
99 myBaseComposition.setBaseMapType(spaceType);
100 myQualPerCycle.clear();
101 myCountPerCycle.clear();
102
104
105 // Close the file if there is already one open - checked by close.
106 status = closeFile();
107 if(status == FastQStatus::FASTQ_SUCCESS)
108 {
109 // Successfully closed a previously opened file if there was one.
110
111 // Open the file
112 myFile = ifopen(fileName, "rt");
113 myFileName = fileName;
114
115 if(myFile == NULL)
116 {
117 // Failed to open the file.
118 status = FastQStatus::FASTQ_OPEN_ERROR;
119 }
120 }
121
122 if(status != FastQStatus::FASTQ_SUCCESS)
123 {
124 // Failed to open the file.
125 std::string errorMessage = "ERROR: Failed to open file: ";
126 errorMessage += fileName;
127 logMessage(errorMessage.c_str());
128 }
129 return(status);
130}
IFILE ifopen(const char *filename, const char *mode, InputFile::ifileCompression compressionMode=InputFile::DEFAULT)
Open a file with the specified name and mode, using a filename of "-" to indicate stdin/stdout.
Definition: InputFile.h:562
void setBaseMapType(BaseAsciiMap::SPACE_TYPE spaceType)
Set the base map type for this composition.
void resetBaseMapType()
Reset the base map type for this composition.
FastQStatus::Status closeFile()
Close a FastQFile.
Definition: FastQFile.cpp:134
Status
Return value enum for the FastQFile class methods, indicating success or error codes.
Definition: FastQStatus.h:31
@ FASTQ_OPEN_ERROR
means the file could not be opened.
Definition: FastQStatus.h:35

References closeFile(), FastQStatus::FASTQ_OPEN_ERROR, FastQStatus::FASTQ_SUCCESS, ifopen(), BaseComposition::resetBaseMapType(), and BaseComposition::setBaseMapType().

Referenced by validateFastQFile().

◆ readFastQSequence()

FastQStatus::Status FastQFile::readFastQSequence ( )

Read 1 FastQSequence, validating it.

Definition at line 309 of file FastQFile.cpp.

310{
311 // First verify that a file is open, if not, return failure.
312 if(!isOpen())
313 {
314 std::string message =
315 "ERROR: Trying to read a fastq file but no file is open.";
316 logMessage(message.c_str());
318 }
319
320 // Reset variables for each sequence.
321 resetForEachSequence();
322
323 bool valid = true;
324
325 // No sequence was read.
326 if(isTimeToQuit())
327 {
329 }
330
331 // The first line is the sequence identifier, so validate that.
332 valid = validateSequenceIdentifierLine();
333
334 if(myFileProblem)
335 {
337 }
338
339 // If we are at the end of the file, check to see if it is a partial
340 // sequence or just an empty line at the end.
341 if(ifeof(myFile))
342 {
343 // If the sequence identifier line was empty and we are at the
344 // end of the file, there is nothing more to validate.
345 if(mySequenceIdLine.Length() != 0)
346 {
347 // There was a sequence identifier line, so this is an incomplete
348 // sequence.
349 myErrorString = "Incomplete Sequence.\n";
350 reportErrorOnLine();
351
352 valid = false;
353 }
354 if(valid)
355 {
356 // Return failure - no sequences were left to read. At the end
357 // of the file. It wasn't invalid and it wasn't really an error.
359 }
360 else
361 {
363 }
364 }
365
366 // If enough errors, quit before reading any more.
367 if(isTimeToQuit())
368 {
369 // Means there was an error, so mark it as invalid.
371 }
372
373 // Validate the Raw Sequence Line(s) and the "+" line.
374 valid &= validateRawSequenceAndPlusLines();
375
376 if(myFileProblem)
377 {
379 }
380
381 // If enough errors, quit before reading any more.
382 if(isTimeToQuit())
383 {
385 }
386
387 // If it is the end of a file, it is missing the quality string.
388 if(ifeof(myFile))
389 {
390 // There was a sequence identifier line, so this is an incomplete
391 // sequence.
392 myErrorString = "Incomplete Sequence, missing Quality String.";
393 reportErrorOnLine();
394 valid = false;
396 }
397
398 // All that is left is to validate the quality string line(s).
399 valid &= validateQualityStringLines();
400
401 if(myFileProblem)
402 {
404 }
405
406 if(valid)
407 {
409 }
411}
bool isOpen()
Check to see if the file is open.
Definition: FastQFile.cpp:162
@ FASTQ_ORDER_ERROR
means the methods are called out of order, like trying to read a file before opening it.
Definition: FastQStatus.h:34
@ FASTQ_READ_ERROR
means that a problem occurred on a read.
Definition: FastQStatus.h:37
@ FASTQ_INVALID
means that the sequence was invalid.
Definition: FastQStatus.h:33
@ FASTQ_NO_SEQUENCE_ERROR
means there were no errors, but no sequences read.
Definition: FastQStatus.h:38

References FastQStatus::FASTQ_INVALID, FastQStatus::FASTQ_NO_SEQUENCE_ERROR, FastQStatus::FASTQ_ORDER_ERROR, FastQStatus::FASTQ_READ_ERROR, FastQStatus::FASTQ_SUCCESS, ifeof(), and isOpen().

Referenced by validateFastQFile().

◆ setMaxErrors()

void FastQFile::setMaxErrors ( int  maxErrors)

Set the number of errors after which to quit reading/validating a file, defaults to -1.

Parameters
maxErrors: number of errors before quitting; -1 indicates to not quit until the entire file has been read/validated (default), 0 indicates to quit without reading/validating anything.

Definition at line 85 of file FastQFile.cpp.

86{
87 myMaxErrors = maxErrors;
88}

◆ validateFastQFile()

FastQStatus::Status FastQFile::validateFastQFile ( const String &  filename,
bool  printBaseComp,
BaseAsciiMap::SPACE_TYPE  spaceType,
bool  printQualAvg = false 
)

Validate the specified fastq file.

Parameters
filename: fastq file to be validated.
printBaseComp: whether or not to print the base composition for the file. true means print it, false means do not.
spaceType: the spaceType to use for validation - BASE_SPACE, COLOR_SPACE, or UNKNOWN (UNKNOWN means to determine the spaceType to validate against from the first character of the first sequence).
printQualAvg: whether or not to print the quality averages for the file. true means to print it, false (default) means do not.
Returns
the fastq validation status, SUCCESS on a successfully validated fastq file.

Definition at line 204 of file FastQFile.cpp.

208{
209 // Open the fastqfile.
210 if(openFile(filename, spaceType) != FastQStatus::FASTQ_SUCCESS)
211 {
212 // Failed to open the specified file.
214 }
215
216 // Track the total number of sequences that were validated.
217 int numSequences = 0;
218
219 // Keep reading the file until there are no more fastq sequences to process
220 // and not configured to quit after a certain number of errors or there
221 // has not yet been that many errors.
222 // Or exit if there is a problem reading the file.
224 while (keepReadingFile() &&
225 ((myMaxErrors == -1) || (myMaxErrors > myNumErrors)))
226 {
227 // Validate one sequence. This call will read all the lines for
228 // one sequence.
229 status = readFastQSequence();
230 if((status == FastQStatus::FASTQ_SUCCESS) || (status == FastQStatus::FASTQ_INVALID))
231 {
232 // Read a sequence and it is either valid or invalid, but
233 // either way, a sequence was read, so increment the sequence count.
234 ++numSequences;
235 }
236 else
237 {
238 // Other error, so break out of processing.
239 break;
240 }
241 }
242
243 // Report Base Composition Statistics.
244 if(printBaseComp)
245 {
246 myBaseComposition.print();
247 }
248
249 if(printQualAvg)
250 {
251 printAvgQual();
252 }
253
254 std::string finishMessage = "Finished processing ";
255 finishMessage += myFileName.c_str();
256 char buffer[100];
257 if(sprintf(buffer,
258 " with %u lines containing %d sequences.",
259 myLineNum, numSequences) > 0)
260 {
261 finishMessage += buffer;
262 logMessage(finishMessage.c_str());
263 }
264 if(sprintf(buffer,
265 "There were a total of %d errors.",
266 myNumErrors) > 0)
267 {
268 logMessage(buffer);
269 }
270
271 // Close the input file.
272 FastQStatus::Status closeStatus = closeFile();
273
274 if((status != FastQStatus::FASTQ_SUCCESS) && (status != FastQStatus::FASTQ_INVALID) &&
276 {
277 // Stopped validating due to some error other than invalid, so
278 // return that error.
279 return(status);
280 }
281 else if(myNumErrors == 0)
282 {
283 // No errors, check to see if there were any sequences.
284 // Finished processing all of the sequences in the file.
285 // If there are no sequences, report an error.
286 if(numSequences == 0)
287 {
288 // Empty file, return error.
289 logMessage("ERROR: No FastQSequences in the file.");
291 }
293 }
294 else
295 {
296 // The file is invalid. But check the close status. If the close
297 // failed, it means there is a problem with the file itself not just
298 // with validation, so the close failure should be returned.
299 if(closeStatus != FastQStatus::FASTQ_SUCCESS)
300 {
301 return(closeStatus);
302 }
304 }
305}
void print()
Print the composition.
FastQStatus::Status openFile(const char *fileName, BaseAsciiMap::SPACE_TYPE spaceType=BaseAsciiMap::UNKNOWN)
Open a FastQFile.
Definition: FastQFile.cpp:92
FastQStatus::Status readFastQSequence()
Read 1 FastQSequence, validating it.
Definition: FastQFile.cpp:309
bool keepReadingFile()
Returns whether or not to keep reading the file, it stops reading (false) if eof or there is a problem reading the file.
Definition: FastQFile.cpp:193

References closeFile(), FastQStatus::FASTQ_INVALID, FastQStatus::FASTQ_NO_SEQUENCE_ERROR, FastQStatus::FASTQ_OPEN_ERROR, FastQStatus::FASTQ_SUCCESS, keepReadingFile(), openFile(), BaseComposition::print(), and readFastQSequence().

Member Data Documentation

◆ myPlusLine

String FastQFile::myPlusLine

Definition at line 109 of file FastQFile.h.

◆ myQualityString

String FastQFile::myQualityString

Definition at line 110 of file FastQFile.h.

◆ myRawSequence

String FastQFile::myRawSequence

Definition at line 106 of file FastQFile.h.

◆ mySequenceIdentifier

String FastQFile::mySequenceIdentifier

Definition at line 108 of file FastQFile.h.

◆ mySequenceIdLine

String FastQFile::mySequenceIdLine

Definition at line 107 of file FastQFile.h.


The documentation for this class was generated from the following files: