libStatGen Software 1
GenomeSequence.h
1/*
2 * Copyright (C) 2010 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#ifndef _GENOME_SEQUENCE_H
19#define _GENOME_SEQUENCE_H
20
21#include <sys/types.h>
22#include <sys/stat.h>
23#if !defined(MD5_DIGEST_LENGTH)
24#define MD5_DIGEST_LENGTH 16
25#endif
26#include <string>
27#include "MemoryMapArray.h"
28#include "BaseAsciiMap.h"
29
30// Goncalo's String class
31#include "StringArray.h"
32
33#include <stdint.h>
34
35// stdint.h will define this, but only if __STDC_LIMIT_MACROS was
36// defined prior to the first include of stdint.h
37#ifndef UINT32_MAX
38#define UINT32_MAX 0xFFFFFFFF
39#endif
40
41typedef uint32_t genomeIndex_t;
42#define INVALID_GENOME_INDEX UINT32_MAX
43
44// chromosome index is just a signed int, so this is ok here:
45#define INVALID_CHROMOSOME_INDEX -1
46
47#include "GenomeSequenceHelpers.h"
48
49#define UMFA_COOKIE 0x1b7933a1 // unique cookie id
50#define UMFA_VERSION 20100401U // YYYYMMDD of last change to file layout
51
52typedef MemoryMapArray<
53uint32_t,
54genomeIndex_t,
55UMFA_COOKIE,
56UMFA_VERSION,
57PackedAccess_4Bit,
58PackedAssign_4Bit,
59Packed4BitElementCount2Bytes,
62
63
64// std::string &operator = (std::string &lhs, const PackedRead &rhs);
65//
66
67//! Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
68/*!
69 GenomeSequence is designed to be a high performance shared access
70 reference object.
71
72 It is implemented as a MemoryMapArray template object with unsigned 8
73 bit ints, each of which stores two bases. Although 2 bits could be used,
74 most references have more than four symbols (usually at least including
75 'N', indicating an unknown or masked out base).
76
77 Normal use of this class follows these steps:
78 -# create the reference
79 -# instantiate the GenomeSequence class object
80 -# create the actual file (memory mapped) that is to hold the data
81 -# populate the data using GenomeSequence::set
82 -# use the reference
83 -# use the reference by instantiating a GenomeSequence object
84 -# either use the constructor with the reference filename
85 -# or use GenomeSequence::setReferenceName() followed by ::open
86 -# access the bases via the overloaded array operator []
87 -# check sequence length by using GenomeSequence::getNumberBases()
88 -# accessing chromosomes in the reference
89 -# you typically will need to know about the chromosomes in the sequence
90 -# see methods and docs with prefix 'getChromosome'
91
92 Sharing is accomplished using the mmap() function via the MemoryMap
93 base class. This allows a potentially large genome reference to be
94 shared among a number of simultaneously executing instances of one or
95 more programs sharing the same reference.
96 */
97
98
100{
101private:
102 int _debugFlag;
103
104 std::ostream *_progressStream;
105 bool _colorSpace;
106
107 // Whether or not to overwrite an existing file when creating a umfa file (via create).
108 bool _createOverwrite;
109
110 std::string _baseFilename; // for later use by WordIndex create and open
111 std::string _referenceFilename;
112 std::string _fastaFilename;
113 std::string _umfaFilename;
114 std::string _application; // only used in ::create()
115
116 MemoryMap _umfaFile;
117
118 void setup(const char *referenceFilename);
119
120public:
121 /// Simple constructor - no implicit file open
123 void constructorClear();
124 /// \brief attempt to open an existing sequence
125 ///
126 /// \param referenceFilename the name of the reference fasta file to open
127 /// \param debug if true, additional debug information is printed
128 GenomeSequence(std::string &referenceFilename)
129 {
130 constructorClear();
131 setup(referenceFilename.c_str());
132 }
133
134 /// Smarter constructor - attempt to open an existing sequence
135 ///
136 /// \param referenceFilename the name of the reference fasta file to open
137 /// \param debug if true, additional debug information is printed
138 GenomeSequence(const char *referenceFilename)
139 {
140 constructorClear();
141 setup(referenceFilename);
142 }
143
144 /// Close the file if open and destroy the object
146
147 /// open the reference specified using GenomeSequence::setReferenceName
148 ///
149 /// \param isColorSpace open the color space reference
150 /// \param flags pass through to the ::open() call (O_RDWR lets you modify the contents)
151 /// \return false for success, true otherwise
152 bool open(bool isColorSpace = false, int flags = O_RDONLY);
153
154 /// open the given file as the genome (no filename munging occurs).
155 ///
156 /// \param filename the name of the file to open
157 /// \param flags pass through to the ::open() call (O_RDWR lets you modify the contents)
158 /// \return false for success, true otherwise
159 bool open(const char *filename, int flags = O_RDONLY)
160 {
161 _umfaFilename = filename;
162 // TODO - should this method be doing something???
163 return false;
164 }
165
166private:
167 bool _searchCommonFileSuffix;
168public:
169 bool create(bool isColor = false);
170
171 // NEW API?
172
173 // load time modifiers:
174 /// if set, then show progress when creating and pre-fetching
175 void setProgressStream(std::ostream &progressStream) {_progressStream = &progressStream;}
176 void setColorSpace(bool colorSpace) {_colorSpace = colorSpace; }
177 void setSearchCommonFileSuffix(bool searchCommonFileSuffix) {_searchCommonFileSuffix = searchCommonFileSuffix;}
178
179 // Set whether or not to overwrite a umfa file when calling create.
180 void setCreateOverwrite(bool createOverwrite) {_createOverwrite = createOverwrite;}
181
182 bool loadFastaData(const char *filename);
183
184 /// set the reference name that will be used in open()
185 /// \param referenceFilename the name of the reference fasta file to open
186 /// \return false for success, true otherwise
187 ///
188 /// \sa open()
189 bool setReferenceName(std::string referenceFilename);
190
191 /// set the application name in the binary file header
192 ///
193 /// \param application name of the application
194 void setApplication(std::string application)
195 {
196 _application = application; // used in ::create() to set application name
197 }
198 const std::string &getFastaName() const
199 {
200 return _fastaFilename;
201 }
202 const std::string &getReferenceName() const
203 {
204 return _referenceFilename;
205 }
206
207 /// tell us if we are a color space reference or not
208 /// \return true if colorspace, false otherwise
209 bool isColorSpace() const
210 {
211 return _colorSpace;
212 }
213
214 /// return the number of bases represented in this reference
215 /// \return count of bases
216 genomeIndex_t getNumberBases() const
217 {
218 return getElementCount();
219 }
220
221 /// given a whole genome index, get the chromosome it is located in
222 ///
223 /// This is done via a binary search of the chromosome table in the
224 /// header of the mapped file, so it is O(log(N))
225 ///
226 /// \param 0-based position the base in the genome
227 /// \return 0-based index into chromosome table - INVALID_CHROMOSOME_INDEX if error
228 int getChromosome(genomeIndex_t position) const;
229
230 /// given a chromosome name, return the chromosome index
231 ///
232 /// This is done via a linear search of the chromosome table in the
233 /// header of the mapped file, so it is O(N)
234 ///
235 /// \param chromosomeName the name of the chromosome - exact match only
236 /// \return 0-based index into chromosome table - INVALID_CHROMOSOME_INDEX if error
237 int getChromosome(const char *chromosomeName) const;
238 /// Return the number of chromosomes in the genome
239 /// \return number of chromosomes in the genome
240 int getChromosomeCount() const;
241
242 /// given a chromosome, return the genome base it starts in
243 ///
244 /// \param 0-based chromosome index
245 /// \return 0-based genome index of the base that starts the chromosome
246 genomeIndex_t getChromosomeStart(int chromosomeIndex) const
247 {
248 if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX;
249 return header->_chromosomes[chromosomeIndex].start;
250 }
251
252 /// given a chromosome, return its size in bases
253 ///
254 /// \param 0-based chromosome index
255 /// \return size of the chromosome in bases
256 genomeIndex_t getChromosomeSize(int chromosomeIndex) const
257 {
258 if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return 0;
259 return header->_chromosomes[chromosomeIndex].size;
260 }
261
262 /// given a chromosome name and position, return the genome position
263 ///
264 /// \param chromosomeName name of the chromosome - exact match only
265 /// \param chromosomeIndex 1-based chromosome position
266 /// \return genome index of the above chromosome position
267 genomeIndex_t getGenomePosition(
268 const char *chromosomeName,
269 unsigned int chromosomeIndex) const;
270
271 /// given a chromosome index and position, return the genome position
272 ///
273 /// \param chromosome index of the chromosome
274 /// \param chromosomeIndex 1-based chromosome position
275 /// \return genome index of the above chromosome position
276 genomeIndex_t getGenomePosition(
277 int chromosome,
278 unsigned int chromosomeIndex) const;
279
280 /// given the chromosome name, get the corresponding 0 based genome index
281 /// for the start of that chromosome
282 genomeIndex_t getGenomePosition(const char *chromosomeName) const;
283 genomeIndex_t getGenomePosition(int chromosomeIndex) const;
284
285 const std::string &getBaseFilename() const
286 {
287 return _baseFilename;
288 }
289
290 const char *getChromosomeName(int chromosomeIndex) const
291 {
292 return header->_chromosomes[chromosomeIndex].name;
293 }
294
295 void setDebugFlag(bool d)
296 {
297 _debugFlag = d;
298 }
299
300 genomeIndex_t sequenceLength() const
301 {
302 return (genomeIndex_t) header->elementCount;
303 }
304
305 const char *chromosomeName(int chr) const
306 {
307 return header->_chromosomes[chr].name;
308 }
309
310 void sanityCheck(MemoryMap &fasta) const;
311
312 // TODO - this will be moved somewhere else and be made a static method.
313 std::string IntegerToSeq(unsigned int n, unsigned int wordsize) const;
314
315 bool wordMatch(unsigned int index, std::string &word) const;
316 bool printNearbyWords(unsigned int index, unsigned int variance, std::string &word) const;
317
318 // TODO - this will be moved somewhere else and be made a static method.
319 char BasePair(char c) const
320 {
321 return BaseAsciiMap::base2complement[(int) c];
322 }
323
324 void dumpSequenceSAMDictionary(std::ostream&) const;
325 void dumpHeaderTSV(std::ostream&) const;
326
327 ///
328 /// Return the bases in base space or color space for within range index, ot
329 /// @param index the array-like index (0 based).
330 /// @return ACTGN in base space; 0123N for color space; and 'N' for invalid.
331 /// For color space, index i represents the transition of base at position (i-1) to base at position i
332 ///
333 /// NB: bounds checking here needs to be deprecated - do not assume it
334 /// will exist - the call must clip reads so that this routine is never
335 /// called with a index value larger than the genome.
336 ///
337 /// The reason for this is simply that this routine gets called hundreds
338 /// of billions of time in one run of karma, which will absolutely kill
339 /// performance. Every single instruction here matters a great, great deal.
340 ///
341
342 //
343 // 3.5% improvement for color space matching:
344 // I guess the compiler isn't inlining 3 functions deep.
345 //
346
347#if 0
348 // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
349 // The following code does not work even in the base space,
350 // since the memory layout has changed. USE IT WITH CAUTIOUS!
351 //
352 // This block of code is a functional duplicate of the following
353 // code - leave this here for reference and possibly later
354 // performance testing as well as compiler evaluation.
355 inline char operator[](genomeIndex_t index) const
356 {
357 return BaseAsciiMap::int2base[(*((genomeSequenceArray*) this))[index]];
358 }
359#endif
360
361 inline char operator[](genomeIndex_t index) const
362 {
363 uint8_t val;
364 if (index < getNumberBases())
365 {
366 if ((index&1)==0)
367 {
368 val = ((uint8_t *) data)[index>>1] & 0xf;
369 }
370 else
371 {
372 val = (((uint8_t *) data)[index>>1] & 0xf0) >> 4;
373 }
374 }
375 else
376 {
378 }
381 return val;
382 }
383
384 /// given a chromosome name and 1-based position, return the reference base.
385 /// \param chromosomeName name of the chromosome - exact match only
386 /// \param chromosomeIndex 1-based chromosome position
387 /// \return reference base at the above chromosome position
388 inline char getBase(const char *chromosomeName,
389 unsigned int chromosomeIndex) const
390 {
391 genomeIndex_t index =
392 getGenomePosition(chromosomeName, chromosomeIndex);
393 if(index == INVALID_GENOME_INDEX)
394 {
395 // Invalid position, so return 'N'
396 return('N');
397 }
398 return((*this)[index]);
399 }
400
401
402 inline uint8_t getInteger(genomeIndex_t index) const
403 {
404 return (*((genomeSequenceArray*) this))[index];
405 }
406
407 inline void set(genomeIndex_t index, char value)
408 {
409 genomeSequenceArray::set(index,
410 BaseAsciiMap::base2int[(uint8_t) value]);
411 }
412
413 /// obtain the pointer to the raw data for other access methods
414 ///
415 /// this is a fairly ugly hack to reach into the
416 /// raw genome vector, get the byte that encodes
417 /// two bases, and return it. This is used by
418 /// karma ReadIndexer::getSumQ to compare genome
419 /// matchines by byte (two bases at a time) to speed
420 /// it up.
421 ///
422 uint8_t *getDataPtr(genomeIndex_t index)
423 {
424 return ((uint8_t *) data + index/2);
425 }
426private:
427 ///
428 /// when creating the genome mapped file, we call this to set
429 /// the MD5 checksum of the chromosome sequence and length so that
430 /// we can write the SAM SQ headers properly.
431 /// NB: operates on the last fully loaded chromosome.
432 bool setChromosomeMD5andLength(uint32_t whichChromosome);
433
434public:
435
436 // TODO - this will be moved somewhere else and be made a static method.
437 // replace read with the reversed one
438 void getReverseRead(std::string &read);
439
440 // TODO - this will be moved somewhere else and be made a static method.
441 void getReverseRead(String& read);
442
443 // debug the given read - print nice results
444 int debugPrintReadValidation(
445 std::string &read,
446 std::string &quality,
447 char direction,
448 genomeIndex_t readLocation,
449 int sumQuality,
450 int mismatchCount,
451 bool recurse = true
452 );
453
454 //
455 // get the sequence from this GenomeSequence using the specified chromosome and 0-based position.
456 // if baseCount < 0, get the reverse complement
457 // that starts at index (but do not reverse the string?)
458 void getString(std::string &str, int chromosome, uint32_t index, int baseCount) const;
459 void getString(String &str, int chromosome, uint32_t index, int baseCount) const;
460 //
461 // get the sequence from this GenomeSequence.
462 // if baseCount < 0, get the reverse complement
463 // that starts at index (but do not reverse the string?)
464 //
465 void getString(std::string &str, genomeIndex_t index, int baseCount) const;
466 void getString(String &str, genomeIndex_t index, int baseCount) const;
467
468 void getHighLightedString(std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd) const;
469
470 void print30(genomeIndex_t) const;
471
472 // for debugging, not for speed:
473 genomeIndex_t simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index, int windowSize) const;
474
475
476 // TODO - these methods do not handle a CIGAR string and do not handle '=' when a read matches the reference.
477 // They are here for alignment and should be moved to the aligner (karma).
478 // OR they should optionally take a CIGAR and use that if specified....
479 // maybe they should be helper methods that are found somewhere else
480
481 /// Return the mismatch count, disregarding CIGAR strings
482 ///
483 /// \param read is the sequence we're counting mismatches in
484 /// \param location is where in the genmoe we start comparing
485 /// \param exclude is a wildcard character (e.g. '.' or 'N')
486 ///
487 /// \return number of bases that don't match the reference, except those that match exclude
488 int getMismatchCount(std::string &read, genomeIndex_t location, char exclude='\0') const
489 {
490 int mismatchCount = 0;
491 for (uint32_t i=0; i<read.size(); i++)
492 if (read[i]!=exclude) mismatchCount += read[i]!=(*this)[location + i];
493 return mismatchCount;
494 };
495
496 /// brute force sumQ - no sanity checking
497 ///
498 /// \param read shotgun sequencer read string
499 /// \param qualities phred quality string of same length
500 /// \param location the alignment location to check sumQ
501 int getSumQ(std::string &read, std::string &qualities, genomeIndex_t location) const
502 {
503 int sumQ = 0;
504 for (uint32_t i=0; i<read.size(); i++)
505 sumQ += (read[i]!=(*this)[location + i] ? (qualities[i]-33) : 0);
506 return sumQ;
507 };
508 // return a string highlighting mismatch postions with '^' chars:
509 void getMismatchHatString(std::string &result, const std::string &read, genomeIndex_t location) const;
510 void getMismatchString(std::string &result, const std::string &read, genomeIndex_t location) const;
511
512 // END TODO
513
514 void getChromosomeAndIndex(std::string &, genomeIndex_t) const;
515 void getChromosomeAndIndex(String &, genomeIndex_t) const;
516
517 /// check a SAM format read, using phred quality scores and
518 /// the CIGAR string to determine if it is correct.
519 ///
520 ///
521 /// \param read the read in base space
522 /// \param qualities the phred encoded qualities (Sanger, not Illumina)
523 /// \param cigar the SAM file CIGAR column
524 /// \param sumQ if >0 on entry, is checked against the computed sumQ
525 /// \param insertions count of insertions found in
526 ///
527 ///
529 std::string &read,
530 std::string &qualities,
531 std::string &cigar,
532 int &sumQ, // input and output
533 int &gapOpenCount, // output only
534 int &gapExtendCount, // output only
535 int &gapDeleteCount, // output only
536 std::string &result
537 ) const;
538
539 bool populateDBSNP(mmapArrayBool_t &dbSNP, IFILE inputFile) const;
540
541 /// user friendly dbSNP loader.
542 ///
543 /// \param inputFileName may be empty, point to a text file or a dbSNP vector file
544 ///
545 /// In all cases, dbSNP is returned the same length as this genome.
546 ///
547 /// When no SNPs are loaded, all values are false.
548 ///
549 /// When a text file is given, the file is parsed with two space
550 /// separated columns - the first column is the chromosome name, and
551 /// the second is the 1-based chromosome position of the SNP.
552 ///
553 /// \return false if a dbSNP file was correctly loaded, true otherwise
554 ///
555 bool loadDBSNP(mmapArrayBool_t &dbSNP, const char *inputFileName) const;
556};
557
558#endif
static unsigned char base2complement[]
This table maps 5' base space to the 3' complement base space values, as well as 5' color space value...
Definition: BaseAsciiMap.h:41
static unsigned char base2int[256+1]
Map ASCII values to a 2 (or 3) bit encoding for the base pair value for just base space (ACTGNactgn).
Definition: BaseAsciiMap.h:61
static const char int2colorSpace[]
Convert from int representation to colorspace representation.
Definition: BaseAsciiMap.h:40
static const char int2base[]
Convert from int representation to the base.
Definition: BaseAsciiMap.h:38
static const int baseNIndex
Value associated with 'N' in the ascii to base map (bad read).
Definition: BaseAsciiMap.h:28
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
bool loadDBSNP(mmapArrayBool_t &dbSNP, const char *inputFileName) const
user friendly dbSNP loader.
char getBase(const char *chromosomeName, unsigned int chromosomeIndex) const
given a chromosome name and 1-based position, return the reference base.
int getChromosomeCount() const
Return the number of chromosomes in the genome.
GenomeSequence(const char *referenceFilename)
Smarter constructor - attempt to open an existing sequence.
int getSumQ(std::string &read, std::string &qualities, genomeIndex_t location) const
brute force sumQ - no sanity checking
uint8_t * getDataPtr(genomeIndex_t index)
obtain the pointer to the raw data for other access methods
int getMismatchCount(std::string &read, genomeIndex_t location, char exclude='\0') const
Return the mismatch count, disregarding CIGAR strings.
void setApplication(std::string application)
set the application name in the binary file header
genomeIndex_t getChromosomeStart(int chromosomeIndex) const
given a chromosome, return the genome base it starts in
genomeIndex_t getGenomePosition(const char *chromosomeName, unsigned int chromosomeIndex) const
given a chromosome name and position, return the genome position
char operator[](genomeIndex_t index) const
Return the bases in base space or color space for within range index, ot.
~GenomeSequence()
Close the file if open and destroy the object.
bool isColorSpace() const
tell us if we are a color space reference or not
void setProgressStream(std::ostream &progressStream)
if set, then show progress when creating and pre-fetching
int getChromosome(genomeIndex_t position) const
given a whole genome index, get the chromosome it is located in
bool setReferenceName(std::string referenceFilename)
set the reference name that will be used in open()
bool open(const char *filename, int flags=O_RDONLY)
open the given file as the genome (no filename munging occurs).
genomeIndex_t getNumberBases() const
return the number of bases represented in this reference
GenomeSequence()
Simple constructor - no implicit file open.
bool checkRead(std::string &read, std::string &qualities, std::string &cigar, int &sumQ, int &gapOpenCount, int &gapExtendCount, int &gapDeleteCount, std::string &result) const
check a SAM format read, using phred quality scores and the CIGAR string to determine if it is correc...
genomeIndex_t getChromosomeSize(int chromosomeIndex) const
given a chromosome, return its size in bases
GenomeSequence(std::string &referenceFilename)
attempt to open an existing sequence
bool open(bool isColorSpace=false, int flags=O_RDONLY)
open the reference specified using GenomeSequence::setReferenceName
Class for easily reading/writing files without having to worry about file type (uncompressed,...
Definition: InputFile.h:37
There are a pair of related data structures in the operating system, and also a few simple algorithms...
Definition: MemoryMap.h:156