libStatGen Software  1
ReferenceSequence.h
1 /*
2  * Copyright (C) 2010 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #ifndef __REFERENCESEQUENCE_H
19 #define __REFERENCESEQUENCE_H
20 
21 #include "BaseAsciiMap.h"
22 #include "Generic.h"
23 #include "PackedVector.h"
24 
25 #include <algorithm>
26 #include <iostream>
27 
28 //
29 // The namespace Sequence is for templated algorithms that are by and large
30 // independent of the underlying storage mechanism for the bases.
31 //
32 // They are written in such a way as to assume that the array operator []
33 // will return an ASCII representation of a base space nucleotide.
34 //
35 // In theory, this set of templates will work with a variety of combinations
36 // of means for representing bases - String, std::string, and others.
37 //
38 // The containers are expected to allow for an overidden [] operator,
39 // and provide a size() method to return the number of bases in the
40 // container.
41 //
42 // In containers where sequence data is placed, they must in addition
43 // have a clear() method as well as a push_back() method as done
44 // in std::string containers.
45 //
46 namespace Sequence {
47 
48 //
49 // wordMatch(Sequnece, Index, Word) - compare a word to a sequence of bases
50 // Sequence is a generic container with a large set of bases
51 // Index is the starting point to start the comparison
52 // Word is the small sequence of bases to match
53 //
54 template<typename sequenceType, typename sequenceIndexType, typename wordType>
55 bool wordMatch(sequenceType &sequence, sequenceIndexType index, wordType &word)
56 {
57  if( (index + word.size()) >= sequence.size() ) return false;
58 
59  for(size_t i = 0; i < word.size(); i++) {
60  if( sequence[index + i] != word[i]) return false;
61  }
62  return true;
63 }
64 
65 //
66 // printNearbyWords(output, sequence, index, word, deviation) searches
67 // for 'deviation' bases on either side of the index into sequence
68 // and prints all occurrences where word appears.
69 //
70 template<typename sequenceType, typename sequenceIndexType, typename wordType>
71 void printNearbyWords(std::ostream &output, sequenceType &sequence, sequenceIndexType index, wordType &word, int deviation)
72 {
73  for (sequenceIndexType i = index - deviation; i < index + deviation; i++)
74  {
75  if (wordMatch(sequence, i, word))
76  {
77  output << "word '"
78  << word
79  << "' found "
80  << i - index
81  << " away from position "
82  << index
83  << "."
84  << std::endl;
85  }
86  }
87 }
88 
89 //
90 // getString(sequence, index, baseCount, word) - populate word with the 'baseCount'
91 // bases that occur at the 'index' starting position in sequence.
92 //
93 template<typename sequenceType, typename sequenceIndexType, typename wordType>
94 void getString(sequenceType &sequence, sequenceIndexType index, int baseCount, wordType &word)
95 {
96  word.clear();
97  for (sequenceIndexType i=0; i < (sequenceIndexType) baseCount; i++)
98  {
99  word.push_back(sequence[index + i]);
100  }
101 }
102 
103 //
104 // getHighLightedString() is a debugging aid for printing "highlighted"
105 // subsets of bases, where the highlighting is done via turning the
106 // base into a lower case ASCII equivalent.
107 //
108 template<typename sequenceType, typename sequenceIndexType, typename wordType>
109 void getHighLightedString(
110  sequenceType &sequence,
111  sequenceIndexType index,
112  int baseCount,
113  wordType &word,
114  sequenceIndexType highLightStart,
115  sequenceIndexType highLightEnd)
116 {
117  word.clear();
118  for (sequenceIndexType i=0; i < (sequenceIndexType) baseCount; i++)
119  {
120  char base = sequence[index+i];
121 
122  if(in(index+i, highLightStart, highLightEnd))
123  base = tolower(base);
124 
125  word.push_back(base);
126  }
127 }
128 
129 //
130 // printBaseContext() outputs a base at location 'index' along with 'baseCount'
131 // bases on either side of that base (default 30).
132 //
133 template<typename sequenceType, typename sequenceIndexType>
134 void printBaseContext(std::ostream &output, sequenceType &sequence, sequenceIndexType index, int baseCount = 30)
135 {
136  output << "index: " << index << std::endl;
137  for (sequenceIndexType i=index-baseCount; i<=index+baseCount; i++)
138  output << sequence[i];
139  output << std::endl;
140  for (sequenceIndexType i=index-baseCount; i<index; i++)
141  std::cout << " ";
142  output << "^";
143  output << std::endl;
144 }
145 
146 template<typename sequenceType, typename sequenceIndexType>
147 void getMismatchHatString(sequenceType &sequence, sequenceIndexType location, std::string &result, std::string &read)
148 {
149  result = "";
150  for (uint32_t i=0; i < read.size(); i++)
151  {
152  if (read[i] == sequence[location+i])
153  result.push_back(' ');
154  else
155  result.push_back('^');
156  }
157 }
158 
159 template<typename sequenceType, typename sequenceIndexType>
160 void getMismatchString(sequenceType &sequence, sequenceIndexType location, std::string &result, std::string &read)
161 {
162  result = "";
163  for (uint32_t i=0; i < read.size(); i++)
164  {
165  if (read[i] == sequence[location+i])
166  result.push_back(toupper(read[i]));
167  else
168  result.push_back(tolower(read[i]));
169  }
170 }
171 
172 /// Return the mismatch count, disregarding CIGAR strings
173 ///
174 /// \param read is the sequence we're counting mismatches in
175 /// \param location is where in the genmoe we start comparing
176 /// \param exclude is a wildcard character (e.g. '.' or 'N')
177 ///
178 /// \return number of bases that don't match the reference, except those that match exclude
179 
180 template<typename sequenceType, typename sequenceIndexType, typename readType>
181 int getMismatchCount(sequenceType &sequence, sequenceIndexType location, readType &read, char exclude='\0')
182 {
183  int mismatchCount = 0;
184  for (uint32_t i=0; i<read.size(); i++)
185  if (read[i]!=exclude) mismatchCount += read[i]!=sequence[location + i];
186  return mismatchCount;
187 }
188 
189 /// brute force sumQ - no sanity checking
190 ///
191 /// \param read shotgun sequencer read string
192 /// \param qualities phred quality string of same length
193 /// \param location the alignment location to check sumQ
194 template<typename sequenceType, typename sequenceIndexType, typename readType, typename qualityType>
195 int getSumQ(sequenceType &sequence, sequenceIndexType location, readType &read, qualityType &qualities)
196 {
197  int sumQ = 0;
198  for (uint32_t i=0; i<read.size(); i++)
199  sumQ += (read[i]!=sequence[location + i] ? (qualities[i]-33) : 0);
200  return sumQ;
201 };
202 
203 
204 
205 template<typename sequenceType, typename sequenceIndexType, typename readType, typename qualityType>
206 sequenceIndexType simpleLocalAligner(
207  sequenceType &sequence,
208  sequenceIndexType index,
209  readType &read,
210  qualityType &quality,
211  int windowSize)
212 {
213  int bestScore = 1000000; // either mismatch count or sumQ
214  sequenceIndexType bestMatchLocation = -1;
215  for (int i=-windowSize; i<windowSize; i++)
216  {
217  int newScore;
218 
219  // 'in' is a template, and types of all three args have to match
220  if( not in((size_t) index + i, (size_t) 0, sequence.size())) continue;
221 
222  if (quality.size() == 0)
223  {
224  newScore = getMismatchCount(sequence, index + i, read);
225  }
226  else
227  {
228  newScore = getSumQ(sequence, index + i, read, quality);
229  }
230  if (newScore < bestScore)
231  {
232  bestScore = newScore;
233  bestMatchLocation = index + i;
234  }
235  }
236  return bestMatchLocation;
237 }
238 
239 
240 }
241 
242 typedef uint32_t PackedVectorIndex_t;
243 
244 //
245 // this is actually a base space implementation, since
246 // the load/set methods do indirection from the symbol
247 // value through a symbol value to symbol lookup table.
248 //
250 {
251 
252  public:
253  inline char operator[](PackedVectorIndex_t baseIndex)
254  {
255  return BaseAsciiMap::int2base[ (static_cast<PackedVector4Bit_t>(*this))[baseIndex]];
256  }
257  inline void set(PackedVectorIndex_t baseIndex, char value)
258  {
259  this->PackedVector4Bit_t::set(baseIndex, BaseAsciiMap::base2int[(uint32_t) value]);
260  }
261  inline void push_back(char value)
262  {
263  this->PackedVector4Bit_t::push_back(BaseAsciiMap::base2int[(uint32_t) value]);
264  }
265 };
266 
267 std::ostream &operator << (std::ostream &stream, PackedSequenceData &v)
268 {
269  for(size_t i=0; i<v.size(); i++) {
270  stream << i << ": " << v[i] << std::endl;
271  }
272  return stream;
273 }
274 
275 
276 //
277 // Load a fasta format file from filename into the buffer
278 // provided by the caller.
279 // While parsing the fasta file, record each chromosome name,
280 // its start location, and its size.
281 //
282 // NB: the caller must implement the logic to determine how
283 // large the sequence data is. There is no correct way to do
284 // this, because we can't reliably estimate here how much sequence
285 // data is contained in a compressed file.
286 //
287 // To safely pre-allocate space in sequenceData, use the reserve() method
288 // before calling this function.
289 //
290 template<typename SequenceDataType, typename ChromosomeNameType>
291 bool loadFastaFile(const char *filename,
292  std::vector<SequenceDataType> &sequenceData,
293  std::vector<ChromosomeNameType> &chromosomeNames)
294 {
295  InputFile inputStream(filename, "r", InputFile::DEFAULT);
296 
297  if(!inputStream.isOpen()) {
298  std::cerr << "Failed to open file " << filename << "\n";
299  return true;
300  }
301 
302  int whichChromosome = -1;
303 
304  //
305  // chromosomeNames is cheap to clear, so do it here.
306  //
307  // NB: I explicitly choose not to clear the sequence data
308  // container, this allows the caller to pre-allocate based
309  // on their knowledge of the size of the expected genome.
310  //
311 
312  chromosomeNames.clear();
313 
314  char ch;
315  while((ch = inputStream.ifgetc()) != EOF) {
316  switch (ch)
317  {
318  case '\n':
319  case '\r':
320  break;
321  case '>':
322  {
323  std::string chromosomeName = "";
324  //
325  // pull out the chromosome new name
326  //
327  while (!isspace((ch = inputStream.ifgetc())) && ch != EOF)
328  {
329  chromosomeName += ch; // slow, but who cares
330  }
331  //
332  // eat the rest of the line
333  //
334  do {
335  ch = inputStream.ifgetc();
336  } while(ch != EOF && ch != '\n' && ch != '\r');
337  //
338  // save the Chromosome name and index into our
339  // header so we can use them later.
340  //
341  chromosomeNames.push_back(chromosomeName);
342 
343  whichChromosome++;
344 
345  sequenceData.resize(whichChromosome+1);
346 
347  break;
348  }
349  default:
350  // we get here for sequence data.
351  //
352  // save the base value
353  // Note: invalid characters come here as well, but we
354  // let ::set deal with mapping them.
355 
356  sequenceData[whichChromosome].push_back(toupper(ch));
357 #if 0
358  if (isColorSpace())
359  {
360 //
361 // anything outside these values represents an invalid base
362 // base codes: 0-> A, 1-> C, 2-> G, 3-> T
363 // colorspace: 0-> blue, 1-> green, 2-> oragne, 3->red
364 //
365  const char fromBase2CS[] =
366  {
367  /* 0000 */ 0, // A->A
368  /* 0001 */ 1, // A->C
369  /* 0010 */ 2, // A->G
370  /* 0011 */ 3, // A->T
371  /* 0100 */ 1, // C->A
372  /* 0101 */ 0, // C->C
373  /* 0110 */ 3, // C->G
374  /* 0111 */ 2, // C->T
375  /* 1000 */ 2, // G->A
376  /* 1001 */ 3, // G->C
377  /* 1010 */ 0, // G->G
378  /* 1011 */ 1, // G->T
379  /* 1100 */ 3, // T->A
380  /* 1101 */ 2, // T->C
381  /* 1110 */ 1, // T->G
382  /* 1111 */ 0, // T->T
383  };
384  //
385  // we are writing color space values on transitions,
386  // so we don't write a colorspace value when we
387  // get the first base value.
388  //
389  // On second and subsequent bases, write based on
390  // the index table above
391  //
392  char thisBase = base2int[(int)(fasta[fastaIndex])];
393  if (lastBase>=0)
394  {
395  char color;
396  if (lastBase>3 || thisBase>3) color=4;
397  else color = fromBase2CS[(int)(lastBase<<2 | thisBase)];
398  // re-use the int to base, because ::set expects a base char (ATCG), not
399  // a color code (0123). It should only matter on final output.
400  set(header->elementCount++, int2base[(int) color]);
401  }
402  lastBase = thisBase;
403  }
404  else
405  {
406  set(header->elementCount++, toupper(fasta[fastaIndex]));
407  }
408 #endif
409  break;
410  }
411  }
412  return false;
413 }
414 
415 #endif
static const char int2base[]
Convert from int representation to the base.
Definition: BaseAsciiMap.h:38
static unsigned char base2int[256+1]
Map ASCII values to a 2 (or 3) bit encoding for the base pair value for just base space (ACTGNactgn)...
Definition: BaseAsciiMap.h:61
InputFile & operator<<(InputFile &stream, const std::string &str)
Write to a file using streaming.
Definition: InputFile.h:736
Class for easily reading/writing files without having to worry about file type (uncompressed, gzip, bgzf) when reading.
Definition: InputFile.h:36
Check the extension, if it is ".gz", treat as gzip, otherwise treat it as UNCOMPRESSED.
Definition: InputFile.h:45