libStatGen Software  1
GenomeSequence Class Reference

Create/Access/Modify/Load Genome Sequences stored as binary mapped files. More...

#include <GenomeSequence.h>

Inheritance diagram for GenomeSequence:
Collaboration diagram for GenomeSequence:

Public Member Functions

 GenomeSequence ()
 Simple constructor - no implicit file open.
 
void constructorClear ()
 
 GenomeSequence (std::string &referenceFilename)
 attempt to open an existing sequence More...
 
 GenomeSequence (const char *referenceFilename)
 Smarter constructor - attempt to open an existing sequence. More...
 
 ~GenomeSequence ()
 Close the file if open and destroy the object.
 
bool open (bool isColorSpace=false, int flags=O_RDONLY)
 open the reference specified using GenomeSequence::setReferenceName More...
 
bool open (const char *filename, int flags=O_RDONLY)
 open the given file as the genome (no filename munging occurs). More...
 
bool create (bool isColor=false)
 
void setProgressStream (std::ostream &progressStream)
 if set, then show progress when creating and pre-fetching
 
void setColorSpace (bool colorSpace)
 
void setSearchCommonFileSuffix (bool searchCommonFileSuffix)
 
void setCreateOverwrite (bool createOverwrite)
 
bool loadFastaData (const char *filename)
 
bool setReferenceName (std::string referenceFilename)
 set the reference name that will be used in open() More...
 
void setApplication (std::string application)
 set the application name in the binary file header More...
 
const std::string & getFastaName () const
 
const std::string & getReferenceName () const
 
bool isColorSpace () const
 tell us if we are a color space reference or not More...
 
genomeIndex_t getNumberBases () const
 return the number of bases represented in this reference More...
 
int getChromosome (genomeIndex_t position) const
 given a whole genome index, get the chromosome it is located in More...
 
int getChromosome (const char *chromosomeName) const
 given a chromosome name, return the chromosome index More...
 
int getChromosomeCount () const
 Return the number of chromosomes in the genome. More...
 
genomeIndex_t getChromosomeStart (int chromosomeIndex) const
 given a chromosome, return the genome base it starts in More...
 
genomeIndex_t getChromosomeSize (int chromosomeIndex) const
 given a chromosome, return its size in bases More...
 
genomeIndex_t getGenomePosition (const char *chromosomeName, unsigned int chromosomeIndex) const
 given a chromosome name and position, return the genome position More...
 
genomeIndex_t getGenomePosition (int chromosome, unsigned int chromosomeIndex) const
 given a chromosome index and position, return the genome position More...
 
genomeIndex_t getGenomePosition (const char *chromosomeName) const
 given the chromosome name, get the corresponding 0-based genome index for the start of that chromosome
 
genomeIndex_t getGenomePosition (int chromosomeIndex) const
 
const std::string & getBaseFilename () const
 
const char * getChromosomeName (int chromosomeIndex) const
 
void setDebugFlag (bool d)
 
genomeIndex_t sequenceLength () const
 
const char * chromosomeName (int chr) const
 
void sanityCheck (MemoryMap &fasta) const
 
std::string IntegerToSeq (unsigned int n, unsigned int wordsize) const
 
bool wordMatch (unsigned int index, std::string &word) const
 
bool printNearbyWords (unsigned int index, unsigned int variance, std::string &word) const
 
char BasePair (char c) const
 
void dumpSequenceSAMDictionary (std::ostream &) const
 
void dumpHeaderTSV (std::ostream &) const
 
char operator[] (genomeIndex_t index) const
 Return the base in base space or color space for an in-range index, otherwise 'N'. More...
 
char getBase (const char *chromosomeName, unsigned int chromosomeIndex) const
 given a chromosome name and 1-based position, return the reference base. More...
 
uint8_t getInteger (genomeIndex_t index) const
 
void set (genomeIndex_t index, char value)
 
uint8_t * getDataPtr (genomeIndex_t index)
 obtain the pointer to the raw data for other access methods More...
 
void getReverseRead (std::string &read)
 
void getReverseRead (String &read)
 
int debugPrintReadValidation (std::string &read, std::string &quality, char direction, genomeIndex_t readLocation, int sumQuality, int mismatchCount, bool recurse=true)
 
void getString (std::string &str, int chromosome, uint32_t index, int baseCount) const
 
void getString (String &str, int chromosome, uint32_t index, int baseCount) const
 
void getString (std::string &str, genomeIndex_t index, int baseCount) const
 
void getString (String &str, genomeIndex_t index, int baseCount) const
 
void getHighLightedString (std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd) const
 
void print30 (genomeIndex_t) const
 
genomeIndex_t simpleLocalAligner (std::string &read, std::string &quality, genomeIndex_t index, int windowSize) const
 
int getMismatchCount (std::string &read, genomeIndex_t location, char exclude='\0') const
 Return the mismatch count, disregarding CIGAR strings. More...
 
int getSumQ (std::string &read, std::string &qualities, genomeIndex_t location) const
 brute force sumQ - no sanity checking More...
 
void getMismatchHatString (std::string &result, const std::string &read, genomeIndex_t location) const
 
void getMismatchString (std::string &result, const std::string &read, genomeIndex_t location) const
 
void getChromosomeAndIndex (std::string &, genomeIndex_t) const
 
void getChromosomeAndIndex (String &, genomeIndex_t) const
 
bool checkRead (std::string &read, std::string &qualities, std::string &cigar, int &sumQ, int &gapOpenCount, int &gapExtendCount, int &gapDeleteCount, std::string &result) const
 check a SAM format read, using phred quality scores and the CIGAR string to determine if it is correct. More...
 
bool populateDBSNP (mmapArrayBool_t &dbSNP, IFILE inputFile) const
 
bool loadDBSNP (mmapArrayBool_t &dbSNP, const char *inputFileName) const
 user friendly dbSNP loader. More...
 
- Public Member Functions inherited from MemoryMapArray< elementT, indexT, cookieVal, versionVal, accessorFunc, setterFunc, elementCount2BytesFunc, arrayHeaderClass >
void constructorClear ()
 
const std::string & getErrorString ()
 
arrayHeaderClass & getHeader ()
 
void setContentCookie (uint32_t c)
 
void setContentVersion (uint32_t v)
 
elementT operator[] (indexT i)
 
void set (indexT i, elementT v)
 
int create (const char *file, indexT elementCount, int optionalHeaderCount=0)
 Create a vector with elementCount members. More...
 
int create (indexT elementCount, int optionalHeaderCount=0)
 allow anonymous (malloc) create. More...
 
bool open (const char *file, int flags=O_RDONLY)
 open a previously created mapped vector More...
 
bool close ()
 
void debugPrint (FILE *f)
 
size_t getElementCount () const
 
- Public Member Functions inherited from MemoryMap
void debug_print ()
 
void constructor_clear ()
 
void destructor_clear ()
 
virtual bool allocate ()
 
virtual bool create (const char *file, size_t size)
 create the memory mapped file on disk More...
 
virtual bool create (size_t size)
 store in allocated memory (malloc), not mmap: More...
 
bool close ()
 
void test ()
 
size_t length ()
 
char operator[] (unsigned int index)
 
int prefetch ()
 
void useMemoryMap (bool flag=true)
 

Additional Inherited Members

- Public Attributes inherited from MemoryMap
void * data
 
- Protected Attributes inherited from MemoryMapArray< elementT, indexT, cookieVal, versionVal, accessorFunc, setterFunc, elementCount2BytesFunc, arrayHeaderClass >
arrayHeaderClass * header
 
char * data
 
std::string errorStr
 

Detailed Description

Create/Access/Modify/Load Genome Sequences stored as binary mapped files.

GenomeSequence is designed to be a high performance shared access reference object.

It is implemented as a MemoryMapArray template object with unsigned 8 bit ints, each of which stores two bases. Although 2 bits could be used, most references have more than four symbols (usually at least including 'N', indicating an unknown or masked out base).

Normal use of this class follows these steps:

  1. create the reference
    1. instantiate the GenomeSequence class object
    2. create the actual file (memory mapped) that is to hold the data
    3. populate the data using GenomeSequence::set
  2. use the reference
    1. use the reference by instantiating a GenomeSequence object
    2. either use the constructor with the reference filename
    3. or use GenomeSequence::setReferenceName() followed by ::open
    4. access the bases via the overloaded array operator []
    5. check sequence length by using GenomeSequence::getNumberBases()
  3. accessing chromosomes in the reference
    1. you typically will need to know about the chromosomes in the sequence
    2. see methods and docs with prefix 'getChromosome'

Sharing is accomplished using the mmap() function via the MemoryMap base class. This allows a potentially large genome reference to be shared among a number of simultaneously executing instances of one or more programs sharing the same reference.

Definition at line 99 of file GenomeSequence.h.

Constructor & Destructor Documentation

◆ GenomeSequence() [1/2]

GenomeSequence::GenomeSequence ( std::string &  referenceFilename)
inline

attempt to open an existing sequence

Parameters
referenceFilename: the name of the reference fasta file to open
debug: if true, additional debug information is printed

Definition at line 128 of file GenomeSequence.h.

129  {
130  constructorClear();
131  setup(referenceFilename.c_str());
132  }

◆ GenomeSequence() [2/2]

GenomeSequence::GenomeSequence ( const char *  referenceFilename)
inline

Smarter constructor - attempt to open an existing sequence.

Parameters
referenceFilename: the name of the reference fasta file to open
debug: if true, additional debug information is printed

Definition at line 138 of file GenomeSequence.h.

139  {
140  constructorClear();
141  setup(referenceFilename);
142  }

Member Function Documentation

◆ checkRead()

bool GenomeSequence::checkRead ( std::string &  read,
std::string &  qualities,
std::string &  cigar,
int &  sumQ,
int &  gapOpenCount,
int &  gapExtendCount,
int &  gapDeleteCount,
std::string &  result 
) const

check a SAM format read, using phred quality scores and the CIGAR string to determine if it is correct.

Parameters
read: the read in base space
qualities: the phred encoded qualities (Sanger, not Illumina)
cigar: the SAM file CIGAR column
sumQ: if >0 on entry, is checked against the computed sumQ
insertionscount of insertions found in

◆ getBase()

char GenomeSequence::getBase ( const char *  chromosomeName,
unsigned int  chromosomeIndex 
) const
inline

given a chromosome name and 1-based position, return the reference base.

Parameters
chromosomeName: name of the chromosome - exact match only
chromosomeIndex: 1-based chromosome position
Returns
reference base at the above chromosome position

Definition at line 388 of file GenomeSequence.h.

390  {
391  genomeIndex_t index =
392  getGenomePosition(chromosomeName, chromosomeIndex);
393  if(index == INVALID_GENOME_INDEX)
394  {
395  // Invalid position, so return 'N'
396  return('N');
397  }
398  return((*this)[index]);
399  }
genomeIndex_t getGenomePosition(const char *chromosomeName, unsigned int chromosomeIndex) const
given a chromosome name and position, return the genome position

References getGenomePosition().

Referenced by PileupElement::getRefBase().

◆ getChromosome() [1/2]

int GenomeSequence::getChromosome ( const char *  chromosomeName) const

given a chromosome name, return the chromosome index

This is done via a linear search of the chromosome table in the header of the mapped file, so it is O(N)

Parameters
chromosomeName: the name of the chromosome - exact match only
Returns
0-based index into chromosome table - INVALID_CHROMOSOME_INDEX if error

Definition at line 814 of file GenomeSequence.cpp.

815 {
816  unsigned int i;
817  for (i=0; i<header->_chromosomeCount; i++)
818  {
819  if (strcmp(header->_chromosomes[i].name, chromosomeName)==0)
820  {
821  return i;
822  }
823  }
824  return INVALID_CHROMOSOME_INDEX;
825 }

◆ getChromosome() [2/2]

int GenomeSequence::getChromosome ( genomeIndex_t  position) const

given a whole genome index, get the chromosome it is located in

This is done via a binary search of the chromosome table in the header of the mapped file, so it is O(log(N))

Parameters
position: the 0-based position of the base in the genome
Returns
0-based index into chromosome table - INVALID_CHROMOSOME_INDEX if error

Definition at line 737 of file GenomeSequence.cpp.

738 {
739  if (position == INVALID_GENOME_INDEX) return INVALID_CHROMOSOME_INDEX;
740 
741  if (header->_chromosomeCount == 0)
742  return INVALID_CHROMOSOME_INDEX;
743 
744  int start = 0;
745  int stop = header->_chromosomeCount - 1;
746 
747  // eliminate case where position is in the last chromosome, since the loop
748  // below falls off the end of the list if it in the last one.
749 
750  if (position > header->_chromosomes[stop].start)
751  return (stop);
752 
753  while (start <= stop)
754  {
755  int middle = (start + stop) / 2;
756 
757  if (position >= header->_chromosomes[middle].start && position < header->_chromosomes[middle + 1].start)
758  return middle;
759 
760  if (position == header->_chromosomes[middle + 1].start)
761  return (middle + 1);
762 
763  if (position > header->_chromosomes[middle + 1].start)
764  start = middle + 1;
765 
766  if (position < header->_chromosomes[middle].start)
767  stop = middle - 1;
768  }
769 
770  return -1;
771 }

Referenced by getGenomePosition().

◆ getChromosomeCount()

int GenomeSequence::getChromosomeCount ( ) const

Return the number of chromosomes in the genome.

Returns
number of chromosomes in the genome

Definition at line 731 of file GenomeSequence.cpp.

732 {
733  return header->_chromosomeCount;
734 }

◆ getChromosomeSize()

genomeIndex_t GenomeSequence::getChromosomeSize ( int  chromosomeIndex) const
inline

given a chromosome, return its size in bases

Parameters
chromosomeIndex: the 0-based chromosome index
Returns
size of the chromosome in bases

Definition at line 256 of file GenomeSequence.h.

257  {
258  if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return 0;
259  return header->_chromosomes[chromosomeIndex].size;
260  }

◆ getChromosomeStart()

genomeIndex_t GenomeSequence::getChromosomeStart ( int  chromosomeIndex) const
inline

given a chromosome, return the genome base it starts in

Parameters
chromosomeIndex: the 0-based chromosome index
Returns
0-based genome index of the base that starts the chromosome

Definition at line 246 of file GenomeSequence.h.

247  {
248  if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX;
249  return header->_chromosomes[chromosomeIndex].start;
250  }

◆ getDataPtr()

uint8_t* GenomeSequence::getDataPtr ( genomeIndex_t  index)
inline

obtain the pointer to the raw data for other access methods

This is a fairly ugly hack to reach into the raw genome vector, get the byte that encodes two bases, and return it. This is used by karma ReadIndexer::getSumQ to compare genome matches by byte (two bases at a time) to speed it up.

Definition at line 422 of file GenomeSequence.h.

423  {
424  return ((uint8_t *) data + index/2);
425  }

◆ getGenomePosition() [1/2]

genomeIndex_t GenomeSequence::getGenomePosition ( const char *  chromosomeName,
unsigned int  chromosomeIndex 
) const

given a chromosome name and position, return the genome position

Parameters
chromosomeName: name of the chromosome - exact match only
chromosomeIndex: 1-based chromosome position
Returns
genome index of the above chromosome position

Definition at line 779 of file GenomeSequence.cpp.

782 {
783  genomeIndex_t i = getGenomePosition(chromosomeName);
784  if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX;
785  return i + chromosomeIndex - 1;
786 }

Referenced by SamTags::createMDTag(), getBase(), SamQuerySeqWithRefIter::reset(), SamQuerySeqWithRef::seqWithEquals(), and SamQuerySeqWithRef::seqWithoutEquals().

◆ getGenomePosition() [2/2]

genomeIndex_t GenomeSequence::getGenomePosition ( int  chromosome,
unsigned int  chromosomeIndex 
) const

given a chromosome index and position, return the genome position

Parameters
chromosome: index of the chromosome
chromosomeIndex: 1-based chromosome position
Returns
genome index of the above chromosome position

Definition at line 788 of file GenomeSequence.cpp.

791 {
792  if (chromosome<0 || chromosome >= (int) header->_chromosomeCount) return INVALID_GENOME_INDEX;
793 
794  genomeIndex_t i = header->_chromosomes[chromosome].start;
795  if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX;
796  return i + chromosomeIndex - 1;
797 }

◆ getMismatchCount()

int GenomeSequence::getMismatchCount ( std::string &  read,
genomeIndex_t  location,
char  exclude = '\0' 
) const
inline

Return the mismatch count, disregarding CIGAR strings.

Parameters
read: the sequence we're counting mismatches in
location: where in the genome we start comparing
exclude: a wildcard character (e.g. '.' or 'N')
Returns
number of bases that don't match the reference, except those that match exclude

Definition at line 488 of file GenomeSequence.h.

489  {
490  int mismatchCount = 0;
491  for (uint32_t i=0; i<read.size(); i++)
492  if (read[i]!=exclude) mismatchCount += read[i]!=(*this)[location + i];
493  return mismatchCount;
494  };

◆ getNumberBases()

genomeIndex_t GenomeSequence::getNumberBases ( ) const
inline

return the number of bases represented in this reference

Returns
count of bases

Definition at line 216 of file GenomeSequence.h.

217  {
218  return getElementCount();
219  }

Referenced by loadDBSNP(), and operator[]().

◆ getSumQ()

int GenomeSequence::getSumQ ( std::string &  read,
std::string &  qualities,
genomeIndex_t  location 
) const
inline

brute force sumQ - no sanity checking

Parameters
read: shotgun sequencer read string
qualities: phred quality string of same length
location: the alignment location to check sumQ

Definition at line 501 of file GenomeSequence.h.

502  {
503  int sumQ = 0;
504  for (uint32_t i=0; i<read.size(); i++)
505  sumQ += (read[i]!=(*this)[location + i] ? (qualities[i]-33) : 0);
506  return sumQ;
507  };

◆ isColorSpace()

bool GenomeSequence::isColorSpace ( ) const
inline

tell us if we are a color space reference or not

Returns
true if colorspace, false otherwise

Definition at line 209 of file GenomeSequence.h.

210  {
211  return _colorSpace;
212  }

Referenced by open(), and operator[]().

◆ loadDBSNP()

bool GenomeSequence::loadDBSNP ( mmapArrayBool_t dbSNP,
const char *  inputFileName 
) const

user friendly dbSNP loader.

Parameters
inputFileName: may be empty, or point to a text file or a dbSNP vector file

In all cases, dbSNP is returned the same length as this genome.

When no SNPs are loaded, all values are false.

When a text file is given, the file is parsed as two space-separated columns - the first column is the chromosome name, and the second is the 1-based chromosome position of the SNP.

Returns
false if a dbSNP file was correctly loaded, true otherwise

Definition at line 1301 of file GenomeSequence.cpp.

1304 {
1305  //
1306  // the goal in this section of code is to allow the user
1307  // to either specify a valid binary version of the SNP file,
1308  // or the original text file that it gets created from.
1309  //
1310  // To do this, we basically open, sniff the error message,
1311  // and if it claims it is not a binary version of the file,
1312  // we go ahead and treat it as the text file and use the
1313  // GenomeSequence::populateDBSNP method to load it.
1314  //
1315  // Further checking is really needed to ensure users don't
1316  // mix a dbSNP file for a different reference, since it is really
1317  // easy to do.
1318  //
1319  if (strlen(inputFileName)!=0)
1320  {
1321  std::cerr << "Load dbSNP file '" << inputFileName << "': " << std::flush;
1322 
1323  if (dbSNP.open(inputFileName, O_RDONLY))
1324  {
1325  //
1326  // failed to open, possibly due to bad magic.
1327  //
1328  // this is really awful ... need to have a return
1329  // code that is smart enough to avoid this ugliness:
1330  //
1331  if (dbSNP.getErrorString().find("wrong type of file")==std::string::npos)
1332  {
1333  std::cerr << "Error: " << dbSNP.getErrorString() << std::endl;
1334  exit(1);
1335  }
1336  //
1337  // we have a file, assume we can load it as a text file
1338  //
1339  IFILE inputFile = ifopen(inputFileName, "r");
1340  if(inputFile == NULL)
1341  {
1342  std::cerr << "Error: failed to open " << inputFileName << std::endl;
1343  exit(1);
1344  }
1345 
1346  std::cerr << "(as text file) ";
1347 
1348  // anonymously (RAM resident only) create:
1349  dbSNP.create(getNumberBases());
1350 
1351  // now load it into RAM
1352  populateDBSNP(dbSNP, inputFile);
1353  ifclose(inputFile);
1354 
1355  }
1356  else
1357  {
1358  std::cerr << "(as binary mapped file) ";
1359  }
1360 
1361  std::cerr << "DONE!" << std::endl;
1362  return false;
1363  }
1364  else
1365  {
1366  return true;
1367  }
1368 }
IFILE ifopen(const char *filename, const char *mode, InputFile::ifileCompression compressionMode=InputFile::DEFAULT)
Open a file with the specified name and mode, using a filename of "-" to indicate stdin/stdout.
Definition: InputFile.h:562
int ifclose(IFILE &file)
Close the file.
Definition: InputFile.h:580
genomeIndex_t getNumberBases() const
return the number of bases represented in this reference
Class for easily reading/writing files without having to worry about file type (uncompressed,...
Definition: InputFile.h:37
bool open(const char *file, int flags=O_RDONLY)
open a previously created mapped vector
int create(const char *file, indexT elementCount, int optionalHeaderCount=0)
Create a vector with elementCount members.

References MemoryMapArray< elementT, indexT, cookieVal, versionVal, accessorFunc, setterFunc, elementCount2BytesFunc, arrayHeaderClass >::create(), getNumberBases(), ifclose(), ifopen(), and MemoryMapArray< elementT, indexT, cookieVal, versionVal, accessorFunc, setterFunc, elementCount2BytesFunc, arrayHeaderClass >::open().

◆ open() [1/2]

bool GenomeSequence::open ( bool  isColorSpace = false,
int  flags = O_RDONLY 
)

open the reference specified using GenomeSequence::setReferenceName

Parameters
isColorSpace: open the color space reference
flags: pass through to the ::open() call (O_RDWR lets you modify the contents)
Returns
false for success, true otherwise

Definition at line 182 of file GenomeSequence.cpp.

183 {
184  bool rc;
185 
186  if (isColorSpace)
187  {
188  _umfaFilename = _baseFilename + "-cs.umfa";
189  }
190  else
191  {
192  _umfaFilename = _baseFilename + "-bs.umfa";
193  }
194 
195  if(access(_umfaFilename.c_str(), R_OK) != 0)
196  {
197  // umfa file doesn't exist, so try to create it.
198  if(create(isColorSpace))
199  {
200  // Couldon't access or create the umfa.
201  std::cerr << "GenomeSequence::open: failed to open file "
202  << _umfaFilename
203  << " also failed creating it."
204  << std::endl;
205  return true;
206  }
207  }
208 
209  rc = genomeSequenceArray::open(_umfaFilename.c_str(), flags);
210  if (rc)
211  {
212  std::cerr << "GenomeSequence::open: failed to open file "
213  << _umfaFilename
214  << std::endl;
215  return true;
216  }
217 
218  _colorSpace = header->_colorSpace;
219 
220  return false;
221 }
bool isColorSpace() const
tell us if we are a color space reference or not

References isColorSpace(), and MemoryMapArray< elementT, indexT, cookieVal, versionVal, accessorFunc, setterFunc, elementCount2BytesFunc, arrayHeaderClass >::open().

◆ open() [2/2]

bool GenomeSequence::open ( const char *  filename,
int  flags = O_RDONLY 
)
inline virtual

open the given file as the genome (no filename munging occurs).

Parameters
filename: the name of the file to open
flags: pass through to the ::open() call (O_RDWR lets you modify the contents)
Returns
false for success, true otherwise

Reimplemented from MemoryMap.

Definition at line 159 of file GenomeSequence.h.

160  {
161  _umfaFilename = filename;
162  // TODO - should this method be doing something???
163  return false;
164  }

◆ operator[]()

char GenomeSequence::operator[] ( genomeIndex_t  index) const
inline

Return the base in base space or color space for an in-range index, otherwise 'N'.

Parameters
index: the array-like index (0 based).
Returns
ACTGN in base space; 0123N for color space; and 'N' for invalid. For color space, index i represents the transition of base at position (i-1) to base at position i

NB: bounds checking here needs to be deprecated - do not assume it will exist - the caller must clip reads so that this routine is never called with an index value larger than the genome.

The reason for this is simply that this routine gets called hundreds of billions of times in one run of karma, which will absolutely kill performance. Every single instruction here matters a great, great deal.

Definition at line 361 of file GenomeSequence.h.

362  {
363  uint8_t val;
364  if (index < getNumberBases())
365  {
366  if ((index&1)==0)
367  {
368  val = ((uint8_t *) data)[index>>1] & 0xf;
369  }
370  else
371  {
372  val = (((uint8_t *) data)[index>>1] & 0xf0) >> 4;
373  }
374  }
375  else
376  {
378  }
381  return val;
382  }
static const char int2colorSpace[]
Convert from int representation to colorspace representation.
Definition: BaseAsciiMap.h:40
static const char int2base[]
Convert from int representation to the base.
Definition: BaseAsciiMap.h:38
static const int baseNIndex
Value associated with 'N' in the ascii to base map (bad read).
Definition: BaseAsciiMap.h:28

References BaseAsciiMap::baseNIndex, getNumberBases(), BaseAsciiMap::int2base, BaseAsciiMap::int2colorSpace, and isColorSpace().

◆ setApplication()

void GenomeSequence::setApplication ( std::string  application)
inline

set the application name in the binary file header

Parameters
application: name of the application

Definition at line 194 of file GenomeSequence.h.

195  {
196  _application = application; // used in ::create() to set application name
197  }

◆ setReferenceName()

bool GenomeSequence::setReferenceName ( std::string  referenceFilename)

set the reference name that will be used in open()

Parameters
referenceFilename: the name of the reference fasta file to open
Returns
false for success, true otherwise
See also
open()

Definition at line 254 of file GenomeSequence.cpp.

255 {
256 
257  if (HAS_SUFFIX(referenceFilename, ".fa"))
258  {
259  _referenceFilename = referenceFilename;
260  _baseFilename = _referenceFilename.substr(0, referenceFilename.size() - 3);
261  }
262  else if (HAS_SUFFIX(referenceFilename, ".umfa"))
263  {
264  _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 5);
265  }
266  else if (HAS_SUFFIX(referenceFilename, "-cs.umfa"))
267  {
268  _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8);
269  }
270  else if (HAS_SUFFIX(referenceFilename, "-bs.umfa"))
271  {
272  _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8);
273  }
274  else
275  {
276  _baseFilename = referenceFilename;
277  }
278  _fastaFilename = _baseFilename + ".fa";
279 
280  if (HAS_SUFFIX(referenceFilename, ".fasta"))
281  {
282  _referenceFilename = referenceFilename;
283  _baseFilename = _referenceFilename.substr(0, referenceFilename.size() - 6);
284  _fastaFilename = _baseFilename + ".fasta";
285  }
286 
287  return false;
288 }

The documentation for this class was generated from the following files: