libStatGen Software  1
CigarRoller.cpp
1 /*
2  * Copyright (C) 2010-2011 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <ctype.h>
21 #include "CigarRoller.h"
22 
23 ////////////////////////////////////////////////////////////////////////
24 //
25 // Cigar Roller Class
26 //
27 
28 
30 {
31  std::vector<CigarOperator>::iterator i;
32  for (i = rhs.cigarOperations.begin(); i != rhs.cigarOperations.end(); i++)
33  {
34  (*this) += *i;
35  }
36  return *this;
37 }
38 
39 
40 //
41 // Append a new operator at the end of the sequence.
42 //
44 {
45  // Adding to the cigar, so the query & reference indexes would be
46  // incomplete, so just clear them.
47  clearQueryAndReferenceIndexes();
48 
49  if (rhs.count==0)
50  {
51  // nothing to do
52  }
53  else if (cigarOperations.empty() || cigarOperations.back() != rhs)
54  {
55  cigarOperations.push_back(rhs);
56  }
57  else
58  {
59  // last stored operation is the same as the new one, so just add it in
60  cigarOperations.back().count += rhs.count;
61  }
62  return *this;
63 }
64 
65 
67 {
68  clear();
69 
70  (*this) += rhs;
71 
72  return *this;
73 }
74 
75 
76 //
77 void CigarRoller::Add(Operation operation, int count)
78 {
79  CigarOperator rhs(operation, count);
80  (*this) += rhs;
81 }
82 
83 
84 void CigarRoller::Add(char operation, int count)
85 {
86  switch (operation)
87  {
88  case 0:
89  case 'M':
90  Add(match, count);
91  break;
92  case 1:
93  case 'I':
94  Add(insert, count);
95  break;
96  case 2:
97  case 'D':
98  Add(del, count);
99  break;
100  case 3:
101  case 'N':
102  Add(skip, count);
103  break;
104  case 4:
105  case 'S':
106  Add(softClip, count);
107  break;
108  case 5:
109  case 'H':
110  Add(hardClip, count);
111  break;
112  case 6:
113  case 'P':
114  Add(pad, count);
115  break;
116  case 7:
117  case '=':
118  Add(match, count);
119  break;
120  case 8:
121  case 'X':
122  Add(match, count);
123  break;
124  default:
125  // Hmmm... what to do?
126  std::cerr << "ERROR "
127  << "(" << __FILE__ << ":" << __LINE__ <<"): "
128  << "Parsing CIGAR - invalid character found "
129  << "with parameter " << operation << " and " << count
130  << std::endl;
131  break;
132  }
133 }
134 
135 
136 void CigarRoller::Add(const char *cigarString)
137 {
138  int operationCount = 0;
139  while (*cigarString)
140  {
141  if (isdigit(*cigarString))
142  {
143  char *endPtr;
144  operationCount = strtol((char *) cigarString, &endPtr, 10);
145  cigarString = endPtr;
146  }
147  else
148  {
149  Add(*cigarString, operationCount);
150  cigarString++;
151  }
152  }
153 }
154 
155 
156 bool CigarRoller::Remove(int index)
157 {
158  if((index < 0) || ((unsigned int)index >= cigarOperations.size()))
159  {
160  // can't remove, out of range, return false.
161  return(false);
162  }
163  cigarOperations.erase(cigarOperations.begin() + index);
164  // Modifying the cigar, so the query & reference indexes are out of date,
165  // so clear them.
166  clearQueryAndReferenceIndexes();
167  return(true);
168 }
169 
170 
171 bool CigarRoller::IncrementCount(int index, int increment)
172 {
173  if((index < 0) || ((unsigned int)index >= cigarOperations.size()))
174  {
175  // can't update, out of range, return false.
176  return(false);
177  }
178  cigarOperations[index].count += increment;
179 
180  // Modifying the cigar, so the query & reference indexes are out of date,
181  // so clear them.
182  clearQueryAndReferenceIndexes();
183  return(true);
184 }
185 
186 
187 bool CigarRoller::Update(int index, Operation op, int count)
188 {
189  if((index < 0) || ((unsigned int)index >= cigarOperations.size()))
190  {
191  // can't update, out of range, return false.
192  return(false);
193  }
194  cigarOperations[index].operation = op;
195  cigarOperations[index].count = count;
196 
197  // Modifying the cigar, so the query & reference indexes are out of date,
198  // so clear them.
199  clearQueryAndReferenceIndexes();
200  return(true);
201 }
202 
203 
204 void CigarRoller::Set(const char *cigarString)
205 {
206  clear();
207  Add(cigarString);
208 }
209 
210 
211 void CigarRoller::Set(const uint32_t* cigarBuffer, uint16_t bufferLen)
212 {
213  clear();
214 
215  // Parse the buffer.
216  for (int i = 0; i < bufferLen; i++)
217  {
218  int opLen = cigarBuffer[i] >> 4;
219 
220  Add(cigarBuffer[i] & 0xF, opLen);
221  }
222 }
223 
224 
225 //
226 // when we examine CIGAR strings, we need to know how
227 // many cumulative insert and delete positions there are
228 // so that we can adjust the read location appropriately.
229 //
230 // Here, we iterate over the vector of CIGAR operations,
231 // summaring the count for each insert or delete (insert
232 // increases the offset, delete decreases it).
233 //
234 // The use case for this is when we have a genome match
235 // position based on an index word other than the first one,
236 // and there is also a insert or delete between the beginning
237 // of the read and the index word. We can't simply report
238 // the match position without taking into account the indels,
239 // otherwise we'll be off by N where N is the sum of this
240 // indel count.
241 //
242 // DEPRECATED - do not use. There are better ways to accomplish that by using
243 // read lengths, reference lengths, span of the read, etc.
245 {
246  int offset = 0;
247  std::vector<CigarOperator>::iterator i;
248 
249  for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
250  {
251  switch (i->operation)
252  {
253  case insert:
254  offset += i->count;
255  break;
256  case del:
257  offset -= i->count;
258  break;
259  // TODO anything for case skip:????
260  default:
261  break;
262  }
263  }
264  return offset;
265 }
266 
267 
268 //
269 // Get the string reprentation of the Cigar operations in this object.
270 // Caller must delete the returned value.
271 //
273 {
274  // NB: the exact size of the string is not important, it just needs to be guaranteed
275  // larger than the largest number of characters we could put into it.
276 
277  // we do not explicitly manage memory usage, and we expect when program exits, the memory used here will be freed
278  static char *ret = NULL;
279  static unsigned int retSize = 0;
280 
281  if (ret == NULL)
282  {
283  retSize = cigarOperations.size() * 12 + 1; // 12 == a magic number -> > 1 + log base 10 of MAXINT
284  ret = (char*) malloc(sizeof(char) * retSize);
285  assert(ret != NULL);
286 
287  }
288  else
289  {
290  // currently, ret pointer has enough memory to use
291  if (retSize > cigarOperations.size() * 12 + 1)
292  {
293  }
294  else
295  {
296  retSize = cigarOperations.size() * 12 + 1;
297  free(ret);
298  ret = (char*) malloc(sizeof(char) * retSize);
299  }
300  assert(ret != NULL);
301  }
302 
303  char *ptr = ret;
304  char buf[12]; // > 1 + log base 10 of MAXINT
305 
306  std::vector<CigarOperator>::iterator i;
307 
308  // Progressively append the character representations of the operations to
309  // the cigar string we allocated above.
310 
311  *ptr = '\0'; // clear result string
312  for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
313  {
314  sprintf(buf, "%d%c", (*i).count, (*i).getChar());
315  strcat(ptr, buf);
316  while (*ptr)
317  {
318  ptr++; // limit the cost of strcat above
319  }
320  }
321  return ret;
322 }
323 
324 
326 {
327  // Clearing the cigar, so the query & reference indexes are out of
328  // date, so clear them.
329  clearQueryAndReferenceIndexes();
330  cigarOperations.clear();
331 }
void Set(const char *cigarString)
Sets this object to the specified cigarString.
insertion to the reference (the query sequence contains bases that have no corresponding base in the ...
Definition: Cigar.h:91
Padding (not in reference or query). Associated with CIGAR Operation "P".
Definition: Cigar.h:96
Operation
Enum for the cigar operations.
Definition: Cigar.h:87
void Add(Operation operation, int count)
Append the specified operation with the specified count to this object.
Definition: CigarRoller.cpp:77
bool Update(int index, Operation op, int count)
Updates the operation at the specified index to be the specified operation and have the specified cou...
skipped region from the reference (the reference contains bases that have no corresponding base in th...
Definition: Cigar.h:93
CigarRoller & operator+=(CigarRoller &rhs)
Add the contents of the specified CigarRoller to this object.
Definition: CigarRoller.cpp:29
Hard clip on the read (clipped sequence not present in the query sequence or reference). Associated with CIGAR Operation "H".
Definition: Cigar.h:95
void clear()
Clear this object so that it has no Cigar Operations.
const char * getString()
Get the string reprentation of the Cigar operations in this object, caller must delete the returned v...
Soft clip on the read (clipped sequence present in the query sequence, but not in reference)...
Definition: Cigar.h:94
match/mismatch operation. Associated with CIGAR Operation "M"
Definition: Cigar.h:89
bool IncrementCount(int index, int increment)
Increments the count for the operation at the specified index by the specified value, specify a negative value to decrement.
deletion from the reference (the reference contains bases that have no corresponding base in the quer...
Definition: Cigar.h:92
int getMatchPositionOffset()
DEPRECATED - do not use, there are better ways to accomplish that by using read lengths, reference lengths, span of the read, etc.
The purpose of this class is to provide accessors for setting, updating, modifying the CIGAR object...
Definition: CigarRoller.h:66
CigarRoller & operator=(CigarRoller &rhs)
Set this object to be equal to the specified CigarRoller.
Definition: CigarRoller.cpp:66
bool Remove(int index)
Remove the operation at the specified index.