18 #ifndef _GENOME_SEQUENCE_H
19 #define _GENOME_SEQUENCE_H
21 #include <sys/types.h>
23 #if !defined(MD5_DIGEST_LENGTH)
24 #define MD5_DIGEST_LENGTH 16
27 #include "MemoryMapArray.h"
28 #include "BaseAsciiMap.h"
31 #include "StringArray.h"
38 #define UINT32_MAX 0xFFFFFFFF
41 typedef uint32_t genomeIndex_t;
42 #define INVALID_GENOME_INDEX UINT32_MAX
45 #define INVALID_CHROMOSOME_INDEX -1
47 #include "GenomeSequenceHelpers.h"
49 #define UMFA_COOKIE 0x1b7933a1
50 #define UMFA_VERSION 20100401U
59 Packed4BitElementCount2Bytes,
104 std::ostream *_progressStream;
108 bool _createOverwrite;
110 std::string _baseFilename;
111 std::string _referenceFilename;
112 std::string _fastaFilename;
113 std::string _umfaFilename;
114 std::string _application;
118 void setup(
const char *referenceFilename);
123 void constructorClear();
131 setup(referenceFilename.c_str());
141 setup(referenceFilename);
159 bool open(
const char *filename,
int flags = O_RDONLY)
161 _umfaFilename = filename;
167 bool _searchCommonFileSuffix;
169 bool create(
bool isColor =
false);
176 void setColorSpace(
bool colorSpace) {_colorSpace = colorSpace; }
177 void setSearchCommonFileSuffix(
bool searchCommonFileSuffix) {_searchCommonFileSuffix = searchCommonFileSuffix;}
180 void setCreateOverwrite(
bool createOverwrite) {_createOverwrite = createOverwrite;}
182 bool loadFastaData(
const char *filename);
196 _application = application;
198 const std::string &getFastaName()
const
200 return _fastaFilename;
202 const std::string &getReferenceName()
const
204 return _referenceFilename;
218 return getElementCount();
248 if (chromosomeIndex==INVALID_CHROMOSOME_INDEX)
return INVALID_GENOME_INDEX;
249 return header->_chromosomes[chromosomeIndex].start;
258 if (chromosomeIndex==INVALID_CHROMOSOME_INDEX)
return 0;
259 return header->_chromosomes[chromosomeIndex].size;
268 const char *chromosomeName,
269 unsigned int chromosomeIndex)
const;
278 unsigned int chromosomeIndex)
const;
285 const std::string &getBaseFilename()
const
287 return _baseFilename;
290 const char *getChromosomeName(
int chromosomeIndex)
const
292 return header->_chromosomes[chromosomeIndex].name;
295 void setDebugFlag(
bool d)
300 genomeIndex_t sequenceLength()
const
302 return (genomeIndex_t) header->elementCount;
305 const char *chromosomeName(
int chr)
const
307 return header->_chromosomes[chr].name;
310 void sanityCheck(
MemoryMap &fasta)
const;
313 std::string IntegerToSeq(
unsigned int n,
unsigned int wordsize)
const;
315 bool wordMatch(
unsigned int index, std::string &word)
const;
316 bool printNearbyWords(
unsigned int index,
unsigned int variance, std::string &word)
const;
319 char BasePair(
char c)
const
324 void dumpSequenceSAMDictionary(std::ostream&)
const;
325 void dumpHeaderTSV(std::ostream&)
const;
355 inline char operator[](genomeIndex_t index)
const
368 val = ((uint8_t *) data)[index>>1] & 0xf;
372 val = (((uint8_t *) data)[index>>1] & 0xf0) >> 4;
388 inline char getBase(
const char *chromosomeName,
389 unsigned int chromosomeIndex)
const
391 genomeIndex_t index =
393 if(index == INVALID_GENOME_INDEX)
398 return((*
this)[index]);
402 inline uint8_t getInteger(genomeIndex_t index)
const
407 inline void set(genomeIndex_t index,
char value)
409 genomeSequenceArray::set(index,
424 return ((uint8_t *) data + index/2);
432 bool setChromosomeMD5andLength(uint32_t whichChromosome);
438 void getReverseRead(std::string &read);
441 void getReverseRead(
String& read);
444 int debugPrintReadValidation(
446 std::string &quality,
448 genomeIndex_t readLocation,
458 void getString(std::string &str,
int chromosome, uint32_t index,
int baseCount)
const;
459 void getString(
String &str,
int chromosome, uint32_t index,
int baseCount)
const;
465 void getString(std::string &str, genomeIndex_t index,
int baseCount)
const;
466 void getString(
String &str, genomeIndex_t index,
int baseCount)
const;
468 void getHighLightedString(std::string &str, genomeIndex_t index,
int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd)
const;
470 void print30(genomeIndex_t)
const;
473 genomeIndex_t simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index,
int windowSize)
const;
490 int mismatchCount = 0;
491 for (uint32_t i=0; i<read.size(); i++)
492 if (read[i]!=exclude) mismatchCount += read[i]!=(*this)[location + i];
493 return mismatchCount;
501 int getSumQ(std::string &read, std::string &qualities, genomeIndex_t location)
const
504 for (uint32_t i=0; i<read.size(); i++)
505 sumQ += (read[i]!=(*
this)[location + i] ? (qualities[i]-33) : 0);
509 void getMismatchHatString(std::string &result,
const std::string &read, genomeIndex_t location)
const;
510 void getMismatchString(std::string &result,
const std::string &read, genomeIndex_t location)
const;
514 void getChromosomeAndIndex(std::string &, genomeIndex_t)
const;
515 void getChromosomeAndIndex(
String &, genomeIndex_t)
const;
530 std::string &qualities,
static unsigned char base2complement[]
This table maps 5' base space to the 3' complement base space values, as well as 5' color space value...
static unsigned char base2int[256+1]
Map ASCII values to a 2 (or 3) bit encoding for the base pair value for just base space (ACTGNactgn).
static const char int2colorSpace[]
Convert from int representation to colorspace representation.
static const char int2base[]
Convert from int representation to the base.
static const int baseNIndex
Value associated with 'N' in the ascii to base map (bad read).
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
bool loadDBSNP(mmapArrayBool_t &dbSNP, const char *inputFileName) const
user friendly dbSNP loader.
char getBase(const char *chromosomeName, unsigned int chromosomeIndex) const
given a chromosome name and 1-based position, return the reference base.
int getChromosomeCount() const
Return the number of chromosomes in the genome.
GenomeSequence(const char *referenceFilename)
Smarter constructor - attempt to open an existing sequence.
int getSumQ(std::string &read, std::string &qualities, genomeIndex_t location) const
brute force sumQ - no sanity checking
int getMismatchCount(std::string &read, genomeIndex_t location, char exclude='\0') const
Return the mismatch count, disregarding CIGAR strings.
void setApplication(std::string application)
set the application name in the binary file header
genomeIndex_t getChromosomeStart(int chromosomeIndex) const
given a chromosome, return the genome base it starts in
genomeIndex_t getGenomePosition(const char *chromosomeName, unsigned int chromosomeIndex) const
given a chromosome name and position, return the genome position
char operator[](genomeIndex_t index) const
Return the bases in base space or color space for within range index, ot.
uint8_t * getDataPtr(genomeIndex_t index)
obtain the pointer to the raw data for other access methods
~GenomeSequence()
Close the file if open and destroy the object.
bool isColorSpace() const
tell us if we are a color space reference or not
void setProgressStream(std::ostream &progressStream)
if set, then show progress when creating and pre-fetching
int getChromosome(genomeIndex_t position) const
given a whole genome index, get the chromosome it is located in
bool setReferenceName(std::string referenceFilename)
set the reference name that will be used in open()
bool open(const char *filename, int flags=O_RDONLY)
open the given file as the genome (no filename munging occurs).
genomeIndex_t getNumberBases() const
return the number of bases represented in this reference
GenomeSequence()
Simple constructor - no implicit file open.
bool checkRead(std::string &read, std::string &qualities, std::string &cigar, int &sumQ, int &gapOpenCount, int &gapExtendCount, int &gapDeleteCount, std::string &result) const
check a SAM format read, using phred quality scores and the CIGAR string to determine if it is correc...
genomeIndex_t getChromosomeSize(int chromosomeIndex) const
given a chromosome, return its size in bases
GenomeSequence(std::string &referenceFilename)
attempt to open an existing sequence
bool open(bool isColorSpace=false, int flags=O_RDONLY)
open the reference specified using GenomeSequence::setReferenceName
There are a pair of related data structures in the operating system, and also a few simple algorithms...