18 #ifndef __REFERENCESEQUENCE_H 19 #define __REFERENCESEQUENCE_H 21 #include "BaseAsciiMap.h" 23 #include "PackedVector.h" 54 template<
typename sequenceType,
typename sequenceIndexType,
typename wordType>
55 bool wordMatch(sequenceType &sequence, sequenceIndexType index, wordType &word)
57 if( (index + word.size()) >= sequence.size() )
return false;
59 for(
size_t i = 0; i < word.size(); i++) {
60 if( sequence[index + i] != word[i])
return false;
70 template<
typename sequenceType,
typename sequenceIndexType,
typename wordType>
71 void printNearbyWords(std::ostream &output, sequenceType &sequence, sequenceIndexType index, wordType &word,
int deviation)
73 for (sequenceIndexType i = index - deviation; i < index + deviation; i++)
75 if (wordMatch(sequence, i, word))
81 <<
" away from position " 93 template<
typename sequenceType,
typename sequenceIndexType,
typename wordType>
94 void getString(sequenceType &sequence, sequenceIndexType index,
int baseCount, wordType &word)
97 for (sequenceIndexType i=0; i < (sequenceIndexType) baseCount; i++)
99 word.push_back(sequence[index + i]);
108 template<
typename sequenceType,
typename sequenceIndexType,
typename wordType>
109 void getHighLightedString(
110 sequenceType &sequence,
111 sequenceIndexType index,
114 sequenceIndexType highLightStart,
115 sequenceIndexType highLightEnd)
118 for (sequenceIndexType i=0; i < (sequenceIndexType) baseCount; i++)
120 char base = sequence[index+i];
122 if(in(index+i, highLightStart, highLightEnd))
123 base = tolower(base);
125 word.push_back(base);
133 template<
typename sequenceType,
typename sequenceIndexType>
134 void printBaseContext(std::ostream &output, sequenceType &sequence, sequenceIndexType index,
int baseCount = 30)
136 output <<
"index: " << index << std::endl;
137 for (sequenceIndexType i=index-baseCount; i<=index+baseCount; i++)
138 output << sequence[i];
140 for (sequenceIndexType i=index-baseCount; i<index; i++)
146 template<
typename sequenceType,
typename sequenceIndexType>
147 void getMismatchHatString(sequenceType &sequence, sequenceIndexType location, std::string &result, std::string &read)
150 for (uint32_t i=0; i < read.size(); i++)
152 if (read[i] == sequence[location+i])
153 result.push_back(
' ');
155 result.push_back(
'^');
159 template<
typename sequenceType,
typename sequenceIndexType>
160 void getMismatchString(sequenceType &sequence, sequenceIndexType location, std::string &result, std::string &read)
163 for (uint32_t i=0; i < read.size(); i++)
165 if (read[i] == sequence[location+i])
166 result.push_back(toupper(read[i]));
168 result.push_back(tolower(read[i]));
180 template<
typename sequenceType,
typename sequenceIndexType,
typename readType>
181 int getMismatchCount(sequenceType &sequence, sequenceIndexType location, readType &read,
char exclude=
'\0')
183 int mismatchCount = 0;
184 for (uint32_t i=0; i<read.size(); i++)
185 if (read[i]!=exclude) mismatchCount += read[i]!=sequence[location + i];
186 return mismatchCount;
194 template<
typename sequenceType,
typename sequenceIndexType,
typename readType,
typename qualityType>
195 int getSumQ(sequenceType &sequence, sequenceIndexType location, readType &read, qualityType &qualities)
198 for (uint32_t i=0; i<read.size(); i++)
199 sumQ += (read[i]!=sequence[location + i] ? (qualities[i]-33) : 0);
205 template<
typename sequenceType,
typename sequenceIndexType,
typename readType,
typename qualityType>
206 sequenceIndexType simpleLocalAligner(
207 sequenceType &sequence,
208 sequenceIndexType index,
210 qualityType &quality,
213 int bestScore = 1000000;
214 sequenceIndexType bestMatchLocation = -1;
215 for (
int i=-windowSize; i<windowSize; i++)
220 if( not in((
size_t) index + i, (
size_t) 0, sequence.size()))
continue;
222 if (quality.size() == 0)
224 newScore = getMismatchCount(sequence, index + i, read);
228 newScore = getSumQ(sequence, index + i, read, quality);
230 if (newScore < bestScore)
232 bestScore = newScore;
233 bestMatchLocation = index + i;
236 return bestMatchLocation;
242 typedef uint32_t PackedVectorIndex_t;
253 inline char operator[](PackedVectorIndex_t baseIndex)
257 inline void set(PackedVectorIndex_t baseIndex,
char value)
261 inline void push_back(
char value)
269 for(
size_t i=0; i<v.size(); i++) {
270 stream << i <<
": " << v[i] << std::endl;
290 template<
typename SequenceDataType,
typename ChromosomeNameType>
291 bool loadFastaFile(
const char *filename,
292 std::vector<SequenceDataType> &sequenceData,
293 std::vector<ChromosomeNameType> &chromosomeNames)
297 if(!inputStream.isOpen()) {
298 std::cerr <<
"Failed to open file " << filename <<
"\n";
302 int whichChromosome = -1;
312 chromosomeNames.clear();
315 while((ch = inputStream.ifgetc()) != EOF) {
323 std::string chromosomeName =
"";
327 while (!isspace((ch = inputStream.ifgetc())) && ch != EOF)
329 chromosomeName += ch;
335 ch = inputStream.ifgetc();
336 }
while(ch != EOF && ch !=
'\n' && ch !=
'\r');
341 chromosomeNames.push_back(chromosomeName);
345 sequenceData.resize(whichChromosome+1);
356 sequenceData[whichChromosome].push_back(toupper(ch));
365 const char fromBase2CS[] =
392 char thisBase = base2int[(int)(fasta[fastaIndex])];
396 if (lastBase>3 || thisBase>3) color=4;
397 else color = fromBase2CS[(int)(lastBase<<2 | thisBase)];
400 set(header->elementCount++, int2base[(int) color]);
406 set(header->elementCount++, toupper(fasta[fastaIndex]));
static const char int2base[]
Convert from int representation to the base.
static unsigned char base2int[256+1]
Map ASCII values to a 2 (or 3) bit encoding for the base pair value for just base space (ACTGNactgn)...