libStatGen Software  1
SamRecord.cpp
1 /*
2  * Copyright (C) 2010-2012 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include <stdlib.h>
19 #include <limits>
20 #include <stdexcept>
21 
22 #include "bam.h"
23 
24 #include "SamRecord.h"
25 #include "SamValidation.h"
26 
27 #include "BaseUtilities.h"
28 #include "SamQuerySeqWithRefHelper.h"
29 
30 const char* SamRecord::DEFAULT_READ_NAME = "UNKNOWN";
31 const char* SamRecord::FIELD_ABSENT_STRING = "=";
32 int SamRecord::myNumWarns = 0;
33 
35  : myStatus(),
36  myRefPtr(NULL),
37  mySequenceTranslation(NONE)
38 {
39  int32_t defaultAllocSize = DEFAULT_BLOCK_SIZE + sizeof(int32_t);
40 
41  myRecordPtr =
42  (bamRecordStruct *) malloc(defaultAllocSize);
43 
44  myCigarTempBuffer = NULL;
45  myCigarTempBufferAllocatedSize = 0;
46 
47  allocatedSize = defaultAllocSize;
48 
49  resetRecord();
50 }
51 
52 
54  : myStatus(errorHandlingType),
55  myRefPtr(NULL),
56  mySequenceTranslation(NONE)
57 {
58  int32_t defaultAllocSize = DEFAULT_BLOCK_SIZE + sizeof(int32_t);
59 
60  myRecordPtr =
61  (bamRecordStruct *) malloc(defaultAllocSize);
62 
63  myCigarTempBuffer = NULL;
64  myCigarTempBufferAllocatedSize = 0;
65 
66  allocatedSize = defaultAllocSize;
67 
68  resetRecord();
69 }
70 
71 
73 {
74  resetRecord();
75 
76  if(myRecordPtr != NULL)
77  {
78  free(myRecordPtr);
79  myRecordPtr = NULL;
80  }
81  if(myCigarTempBuffer != NULL)
82  {
83  free(myCigarTempBuffer);
84  myCigarTempBuffer = NULL;
85  myCigarTempBufferAllocatedSize = 0;
86  }
87 }
88 
89 
90 // Resets the fields of the record to a default value.
92 {
93  myIsBufferSynced = true;
94 
95  myRecordPtr->myBlockSize = DEFAULT_BLOCK_SIZE;
96  myRecordPtr->myReferenceID = -1;
97  myRecordPtr->myPosition = -1;
98  myRecordPtr->myReadNameLength = DEFAULT_READ_NAME_LENGTH;
99  myRecordPtr->myMapQuality = 0;
100  myRecordPtr->myBin = DEFAULT_BIN;
101  myRecordPtr->myCigarLength = 0;
102  myRecordPtr->myFlag = 0;
103  myRecordPtr->myReadLength = 0;
104  myRecordPtr->myMateReferenceID = -1;
105  myRecordPtr->myMatePosition = -1;
106  myRecordPtr->myInsertSize = 0;
107 
108  // Set the sam values for the variable length fields.
109  // TODO - one way to speed this up might be to not set to "*" and just
110  // clear them, and write out a '*' for SAM if it is empty.
111  myReadName = DEFAULT_READ_NAME;
112  myReferenceName = "*";
113  myMateReferenceName = "*";
114  myCigar = "*";
115  mySequence = "*";
116  mySeqWithEq.clear();
117  mySeqWithoutEq.clear();
118  myQuality = "*";
119  myNeedToSetTagsFromBuffer = false;
120  myNeedToSetTagsInBuffer = false;
121 
122  // Initialize the calculated alignment info to the uncalculated value.
123  myAlignmentLength = -1;
124  myUnclippedStartOffset = -1;
125  myUnclippedEndOffset = -1;
126 
127  clearTags();
128 
129  // Set the bam values for the variable length fields.
130  // Only the read name needs to be set, the others are a length of 0.
131  // Set the read name. The min size of myRecordPtr includes the size for
132  // the default read name.
133  memcpy(&(myRecordPtr->myData), myReadName.c_str(),
134  myRecordPtr->myReadNameLength);
135 
136  // Set that the variable length buffer fields are valid.
137  myIsReadNameBufferValid = true;
138  myIsCigarBufferValid = true;
139  myPackedSequence =
140  (unsigned char *)myRecordPtr->myData + myRecordPtr->myReadNameLength +
141  myRecordPtr->myCigarLength * sizeof(int);
142  myIsSequenceBufferValid = true;
143  myBufferSequenceTranslation = NONE;
144 
145  myPackedQuality = myPackedSequence;
146  myIsQualityBufferValid = true;
147  myIsTagsBufferValid = true;
148  myIsBinValid = true;
149 
150  myCigarTempBufferLength = -1;
151 
152  myStatus = SamStatus::SUCCESS;
153 
154  NOT_FOUND_TAG_STRING = "";
155  NOT_FOUND_TAG_INT = -1; // TODO - deprecate
156 }
157 
158 
159 // Returns whether or not the record is valid.
160 // Header is needed to perform some validation against it.
162 {
163  myStatus = SamStatus::SUCCESS;
164  SamValidationErrors invalidSamErrors;
165  if(!SamValidator::isValid(header, *this, invalidSamErrors))
166  {
167  // The record is not valid.
168  std::string errorMessage = "";
169  invalidSamErrors.getErrorString(errorMessage);
170  myStatus.setStatus(SamStatus::INVALID, errorMessage.c_str());
171  return(false);
172  }
173  // The record is valid.
174  return(true);
175 }
176 
177 
179 {
180  myRefPtr = reference;
181 }
182 
183 
184 // Set the type of sequence translation to use when getting
185 // the sequence. The default type (if this method is never called) is
186 // NONE (the sequence is left as-is). This is used
188 {
189  mySequenceTranslation = translation;
190 }
191 
192 
193 bool SamRecord::setReadName(const char* readName)
194 {
195  myReadName = readName;
196  myIsBufferSynced = false;
197  myIsReadNameBufferValid = false;
198  myStatus = SamStatus::SUCCESS;
199 
200  // The read name must at least have some length, otherwise this is a parsing
201  // error.
202  if(myReadName.Length() == 0)
203  {
204  // Invalid - reset ReadName return false.
205  myReadName = DEFAULT_READ_NAME;
206  myRecordPtr->myReadNameLength = DEFAULT_READ_NAME_LENGTH;
207  myStatus.setStatus(SamStatus::INVALID, "0 length Query Name.");
208  return(false);
209  }
210 
211  return true;
212 }
213 
214 
215 bool SamRecord::setFlag(uint16_t flag)
216 {
217  myStatus = SamStatus::SUCCESS;
218  myRecordPtr->myFlag = flag;
219  return true;
220 }
221 
222 
224  const char* referenceName)
225 {
226  myStatus = SamStatus::SUCCESS;
227 
228  myReferenceName = referenceName;
229  // If the reference ID does not already exist, add it (pass true)
230  myRecordPtr->myReferenceID = header.getReferenceID(referenceName, true);
231 
232  return true;
233 }
234 
235 
236 bool SamRecord::set1BasedPosition(int32_t position)
237 {
238  return(set0BasedPosition(position - 1));
239 }
240 
241 
242 bool SamRecord::set0BasedPosition(int32_t position)
243 {
244  myStatus = SamStatus::SUCCESS;
245  myRecordPtr->myPosition = position;
246  myIsBinValid = false;
247  return true;
248 }
249 
250 
251 bool SamRecord::setMapQuality(uint8_t mapQuality)
252 {
253  myStatus = SamStatus::SUCCESS;
254  myRecordPtr->myMapQuality = mapQuality;
255  return true;
256 }
257 
258 
259 bool SamRecord::setCigar(const char* cigar)
260 {
261  myStatus = SamStatus::SUCCESS;
262  myCigar = cigar;
263 
264  myIsBufferSynced = false;
265  myIsCigarBufferValid = false;
266  myCigarTempBufferLength = -1;
267  myIsBinValid = false;
268 
269  // Initialize the calculated alignment info to the uncalculated value.
270  myAlignmentLength = -1;
271  myUnclippedStartOffset = -1;
272  myUnclippedEndOffset = -1;
273 
274  return true;
275 }
276 
277 
278 bool SamRecord::setCigar(const Cigar& cigar)
279 {
280  myStatus = SamStatus::SUCCESS;
281  cigar.getCigarString(myCigar);
282 
283  myIsBufferSynced = false;
284  myIsCigarBufferValid = false;
285  myCigarTempBufferLength = -1;
286  myIsBinValid = false;
287 
288  // Initialize the calculated alignment info to the uncalculated value.
289  myAlignmentLength = -1;
290  myUnclippedStartOffset = -1;
291  myUnclippedEndOffset = -1;
292 
293  return true;
294 }
295 
296 
298  const char* mateReferenceName)
299 {
300  myStatus = SamStatus::SUCCESS;
301  // Set the mate reference, if it is "=", set it to be equal
302  // to myReferenceName. This assumes that myReferenceName has already
303  // been called.
304  if(strcmp(mateReferenceName, FIELD_ABSENT_STRING) == 0)
305  {
306  myMateReferenceName = myReferenceName;
307  }
308  else
309  {
310  myMateReferenceName = mateReferenceName;
311  }
312 
313  // Set the Mate Reference ID.
314  // If the reference ID does not already exist, add it (pass true)
315  myRecordPtr->myMateReferenceID =
316  header.getReferenceID(myMateReferenceName, true);
317 
318  return true;
319 }
320 
321 
322 bool SamRecord::set1BasedMatePosition(int32_t matePosition)
323 {
324  return(set0BasedMatePosition(matePosition - 1));
325 }
326 
327 
328 bool SamRecord::set0BasedMatePosition(int32_t matePosition)
329 {
330  myStatus = SamStatus::SUCCESS;
331  myRecordPtr->myMatePosition = matePosition;
332  return true;
333 }
334 
335 
336 bool SamRecord::setInsertSize(int32_t insertSize)
337 {
338  myStatus = SamStatus::SUCCESS;
339  myRecordPtr->myInsertSize = insertSize;
340  return true;
341 }
342 
343 
344 bool SamRecord::setSequence(const char* seq)
345 {
346  myStatus = SamStatus::SUCCESS;
347  mySequence = seq;
348  mySeqWithEq.clear();
349  mySeqWithoutEq.clear();
350 
351  myIsBufferSynced = false;
352  myIsSequenceBufferValid = false;
353  return true;
354 }
355 
356 
357 bool SamRecord::setQuality(const char* quality)
358 {
359  myStatus = SamStatus::SUCCESS;
360  myQuality = quality;
361  myIsBufferSynced = false;
362  myIsQualityBufferValid = false;
363  return true;
364 }
365 
366 
367 //Shift indels to the left
369 {
370  // Check to see whether or not the Cigar has already been
371  // set - this is determined by checking if alignment length
372  // is set since alignment length and the cigar are set
373  // at the same time.
374  if(myAlignmentLength == -1)
375  {
376  // Not been set, so calculate it.
377  parseCigar();
378  }
379 
380  // Track whether or not there was a shift.
381  bool shifted = false;
382 
383  // Cigar is set, so now myCigarRoller can be used.
384  // Track where in the read we are.
385  uint32_t currentPos = 0;
386 
387  // Since the loop starts at 1 because the first operation can't be shifted,
388  // increment the currentPos past the first operation.
389  if(Cigar::foundInQuery(myCigarRoller[0]))
390  {
391  // This op was found in the read, increment the current position.
392  currentPos += myCigarRoller[0].count;
393  }
394 
395  int numOps = myCigarRoller.size();
396 
397  // Loop through the cigar operations from the 2nd operation since
398  // the first operation is already on the end and can't shift.
399  for(int currentOp = 1; currentOp < numOps; currentOp++)
400  {
401  if(myCigarRoller[currentOp].operation == Cigar::insert)
402  {
403  // For now, only shift a max of 1 operation.
404  int prevOpIndex = currentOp-1;
405  // Track the next op for seeing if it is the same as the
406  // previous for merging reasons.
407  int nextOpIndex = currentOp+1;
408  if(nextOpIndex == numOps)
409  {
410  // There is no next op, so set it equal to the current one.
411  nextOpIndex = currentOp;
412  }
413  // The start of the previous operation, so we know when we hit it
414  // so we don't shift past it.
415  uint32_t prevOpStart =
416  currentPos - myCigarRoller[prevOpIndex].count;
417 
418  // We can only shift if the previous operation
419  if(!Cigar::isMatchOrMismatch(myCigarRoller[prevOpIndex]))
420  {
421  // TODO - shift past pads
422  // An insert is in the read, so increment the position.
423  currentPos += myCigarRoller[currentOp].count;
424  // Not a match/mismatch, so can't shift into it.
425  continue;
426  }
427 
428  // It is a match or mismatch, so check to see if we can
429  // shift into it.
430 
431  // The end of the insert is calculated by adding the size
432  // of this insert minus 1 to the start of the insert.
433  uint32_t insertEndPos =
434  currentPos + myCigarRoller[currentOp].count - 1;
435 
436  // The insert starts at the current position.
437  uint32_t insertStartPos = currentPos;
438 
439  // Loop as long as the position before the insert start
440  // matches the last character in the insert. If they match,
441  // the insert can be shifted one index left because the
442  // implied reference will not change. If they do not match,
443  // we can't shift because the implied reference would change.
444  // Stop loop when insertStartPos = prevOpStart, because we
445  // don't want to move past that.
446  while((insertStartPos > prevOpStart) &&
447  (getSequence(insertEndPos,BASES) ==
448  getSequence(insertStartPos - 1, BASES)))
449  {
450  // We can shift, so move the insert start & end one left.
451  --insertEndPos;
452  --insertStartPos;
453  }
454 
455  // Determine if a shift has occurred.
456  int shiftLen = currentPos - insertStartPos;
457  if(shiftLen > 0)
458  {
459  // Shift occured, so adjust the cigar if the cigar will
460  // not become more operations.
461  // If the next operation is the same as the previous or
462  // if the insert and the previous operation switch positions
463  // then the cigar has the same number of operations.
464  // If the next operation is different, and the shift splits
465  // the previous operation in 2, then the cigar would
466  // become longer, so we do not want to shift.
467  if(myCigarRoller[nextOpIndex].operation ==
468  myCigarRoller[prevOpIndex].operation)
469  {
470  // The operations are the same, so merge them by adding
471  // the length of the shift to the next operation.
472  myCigarRoller.IncrementCount(nextOpIndex, shiftLen);
473  myCigarRoller.IncrementCount(prevOpIndex, -shiftLen);
474 
475  // If the previous op length is 0, just remove that
476  // operation.
477  if(myCigarRoller[prevOpIndex].count == 0)
478  {
479  myCigarRoller.Remove(prevOpIndex);
480  }
481  shifted = true;
482  }
483  else
484  {
485  // Can only shift if the insert shifts past the
486  // entire previous operation, otherwise an operation
487  // would need to be added.
488  if(insertStartPos == prevOpStart)
489  {
490  // Swap the positions of the insert and the
491  // previous operation.
492  myCigarRoller.Update(currentOp,
493  myCigarRoller[prevOpIndex].operation,
494  myCigarRoller[prevOpIndex].count);
495  // Size of the previous op is the entire
496  // shift length.
497  myCigarRoller.Update(prevOpIndex,
499  shiftLen);
500  shifted = true;
501  }
502  }
503  }
504  // An insert is in the read, so increment the position.
505  currentPos += myCigarRoller[currentOp].count;
506  }
507  else if(Cigar::foundInQuery(myCigarRoller[currentOp]))
508  {
509  // This op was found in the read, increment the current position.
510  currentPos += myCigarRoller[currentOp].count;
511  }
512  }
513  if(shifted)
514  {
515  // TODO - setCigar is currently inefficient because later the cigar
516  // roller will be recalculated, but for now it will work.
517  setCigar(myCigarRoller);
518  }
519  return(shifted);
520 }
521 
522 
523 // Set the BAM record from the passeed in buffer of the specified size.
524 // Note: The size includes the block size.
526  uint32_t fromBufferSize,
527  SamFileHeader& header)
528 {
529  myStatus = SamStatus::SUCCESS;
530  if((fromBuffer == NULL) || (fromBufferSize == 0))
531  {
532  // Buffer is empty.
534  "Cannot parse an empty file.");
535  return(SamStatus::FAIL_PARSE);
536  }
537 
538  // Clear the record.
539  resetRecord();
540 
541  // allocate space for the record size.
542  if(!allocateRecordStructure(fromBufferSize))
543  {
544  // Failed to allocate space.
545  return(SamStatus::FAIL_MEM);
546  }
547 
548  memcpy(myRecordPtr, fromBuffer, fromBufferSize);
549 
550  setVariablesForNewBuffer(header);
551 
552  // Return the status of the record.
553  return(SamStatus::SUCCESS);
554 }
555 
556 
557 // Read the BAM record from a file.
559  SamFileHeader& header)
560 {
561  myStatus = SamStatus::SUCCESS;
562  if((filePtr == NULL) || (filePtr->isOpen() == false))
563  {
564  // File is not open, return failure.
566  "Can't read from an unopened file.");
567  return(SamStatus::FAIL_ORDER);
568  }
569 
570  // Clear the record.
571  resetRecord();
572 
573  // read the record size.
574  int numBytes =
575  ifread(filePtr, &(myRecordPtr->myBlockSize), sizeof(int32_t));
576 
577  // Check to see if the end of the file was hit and no bytes were read.
578  if(ifeof(filePtr) && (numBytes == 0))
579  {
580  // End of file, nothing was read, no more records.
581  std::string statusMsg = "No more records left to read, ";
582  statusMsg += filePtr->getFileName();
583  statusMsg += ".";
585  statusMsg.c_str());
586  return(SamStatus::NO_MORE_RECS);
587  }
588 
589  if(numBytes != sizeof(int32_t))
590  {
591  // Failed to read the entire block size. Either the end of the file
592  // was reached early or there was an error.
593  if(ifeof(filePtr))
594  {
595  // Error: end of the file reached prior to reading the rest of the
596  // record.
597  std::string statusMsg = "EOF reached in the middle of a record, ";
598  statusMsg += filePtr->getFileName();
599  statusMsg += ".";
601  statusMsg.c_str());
602  return(SamStatus::FAIL_PARSE);
603  }
604  else
605  {
606  // Error reading.
607  std::string statusMsg = "Failed to read the record size, ";
608  statusMsg += filePtr->getFileName();
609  statusMsg += ".";
610  myStatus.setStatus(SamStatus::FAIL_IO,
611  statusMsg.c_str());
612  return(SamStatus::FAIL_IO);
613  }
614  }
615 
616  // allocate space for the record size.
617  if(!allocateRecordStructure(myRecordPtr->myBlockSize + sizeof(int32_t)))
618  {
619  // Failed to allocate space.
620  // Status is set by allocateRecordStructure.
621  return(SamStatus::FAIL_MEM);
622  }
623 
624  // Read the rest of the alignment block, starting at the reference id.
625  if(ifread(filePtr, &(myRecordPtr->myReferenceID), myRecordPtr->myBlockSize)
626  != (unsigned int)myRecordPtr->myBlockSize)
627  {
628  // Error reading the record. Reset it and return failure.
629  resetRecord();
630  std::string statusMsg = "Failed to read the record, ";
631  statusMsg += filePtr->getFileName();
632  statusMsg += ".";
633  myStatus.setStatus(SamStatus::FAIL_IO,
634  statusMsg.c_str());
635  return(SamStatus::FAIL_IO);
636  }
637 
638  setVariablesForNewBuffer(header);
639 
640  // Return the status of the record.
641  return(SamStatus::SUCCESS);
642 }
643 
644 
645 // Add the specified tag to the record.
646 // Returns true if the tag was successfully added, false otherwise.
647 bool SamRecord::addIntTag(const char* tag, int32_t value)
648 {
649  myStatus = SamStatus::SUCCESS;
650  int key = 0;
651  int index = 0;
652  char bamvtype;
653 
654  int tagBufferSize = 0;
655 
656  // First check to see if the tags need to be synced to the buffer.
657  if(myNeedToSetTagsFromBuffer)
658  {
659  if(!setTagsFromBuffer())
660  {
661  // Failed to read tags from the buffer, so cannot add new ones.
662  return(false);
663  }
664  }
665 
666  // Ints come in as int. But it can be represented in fewer bits.
667  // So determine a more specific type that is in line with the
668  // types for BAM files.
669  // First check to see if it is a negative.
670  if(value < 0)
671  {
672  // The int is negative, so it will need to use a signed type.
673  // See if it is greater than the min value for a char.
674  if(value > ((std::numeric_limits<char>::min)()))
675  {
676  // It can be stored in a signed char.
677  bamvtype = 'c';
678  tagBufferSize += 4;
679  }
680  else if(value > ((std::numeric_limits<short>::min)()))
681  {
682  // It fits in a signed short.
683  bamvtype = 's';
684  tagBufferSize += 5;
685  }
686  else
687  {
688  // Just store it as a signed int.
689  bamvtype = 'i';
690  tagBufferSize += 7;
691  }
692  }
693  else
694  {
695  // It is positive, so an unsigned type can be used.
696  if(value < ((std::numeric_limits<unsigned char>::max)()))
697  {
698  // It is under the max of an unsigned char.
699  bamvtype = 'C';
700  tagBufferSize += 4;
701  }
702  else if(value < ((std::numeric_limits<unsigned short>::max)()))
703  {
704  // It is under the max of an unsigned short.
705  bamvtype = 'S';
706  tagBufferSize += 5;
707  }
708  else
709  {
710  // Just store it as an unsigned int.
711  bamvtype = 'I';
712  tagBufferSize += 7;
713  }
714  }
715 
716  // Check to see if the tag is already there.
717  key = MAKEKEY(tag[0], tag[1], bamvtype);
718  unsigned int hashIndex = extras.Find(key);
719  if(hashIndex != LH_NOTFOUND)
720  {
721  // Tag was already found.
722  index = extras[hashIndex];
723 
724  // Since the tagBufferSize was already updated with the new value,
725  // subtract the size for the previous tag (even if they are the same).
726  switch(intType[index])
727  {
728  case 'c':
729  case 'C':
730  case 'A':
731  tagBufferSize -= 4;
732  break;
733  case 's':
734  case 'S':
735  tagBufferSize -= 5;
736  break;
737  case 'i':
738  case 'I':
739  tagBufferSize -= 7;
740  break;
741  default:
742  myStatus.setStatus(SamStatus::INVALID,
743  "unknown tag inttype type found.\n");
744  return(false);
745  }
746 
747  // Tag already existed, print message about overwriting.
748  // WARN about dropping duplicate tags.
749  if(myNumWarns++ < myMaxWarns)
750  {
751  String newVal;
752  String origVal;
753  appendIntArrayValue(index, origVal);
754  appendIntArrayValue(bamvtype, value, newVal);
755  fprintf(stderr, "WARNING: Duplicate Tags, overwritting %c%c:%c:%s with %c%c:%c:%s\n",
756  tag[0], tag[1], intType[index], origVal.c_str(), tag[0], tag[1], bamvtype, newVal.c_str());
757  if(myNumWarns == myMaxWarns)
758  {
759  fprintf(stderr, "Suppressing rest of Duplicate Tag warnings.\n");
760  }
761  }
762 
763  // Update the integer value and type.
764  integers[index] = value;
765  intType[index] = bamvtype;
766  }
767  else
768  {
769  // Tag is not already there, so add it.
770  index = integers.Length();
771 
772  integers.Push(value);
773  intType.push_back(bamvtype);
774 
775  extras.Add(key, index);
776  }
777 
778  // The buffer tags are now out of sync.
779  myNeedToSetTagsInBuffer = true;
780  myIsTagsBufferValid = false;
781  myIsBufferSynced = false;
782  myTagBufferSize += tagBufferSize;
783 
784  return(true);
785 }
786 
787 
788 // Add the specified tag to the record, replacing it if it is already there and
789 // is different from the previous value.
790 // Returns true if the tag was successfully added (or was already there), false otherwise.
791 bool SamRecord::addTag(const char* tag, char vtype, const char* valuePtr)
792 {
793  if(vtype == 'i')
794  {
795  // integer type. Call addIntTag to handle it.
796  int intVal = atoi(valuePtr);
797  return(addIntTag(tag, intVal));
798  }
799 
800  // Non-int type.
801  myStatus = SamStatus::SUCCESS;
802  bool status = true; // default to successful.
803  int key = 0;
804  int index = 0;
805 
806  int tagBufferSize = 0;
807 
808  // First check to see if the tags need to be synced to the buffer.
809  if(myNeedToSetTagsFromBuffer)
810  {
811  if(!setTagsFromBuffer())
812  {
813  // Failed to read tags from the buffer, so cannot add new ones.
814  return(false);
815  }
816  }
817 
818  // First check to see if the tag is already there.
819  key = MAKEKEY(tag[0], tag[1], vtype);
820  unsigned int hashIndex = extras.Find(key);
821  if(hashIndex != LH_NOTFOUND)
822  {
823  // The key was found in the hash, so get the lookup index.
824  index = extras[hashIndex];
825 
826  String origTag;
827  char origType = vtype;
828 
829  // Adjust the currently pointed to value to the new setting.
830  switch (vtype)
831  {
832  case 'A' :
833  // First check to see if the value changed.
834  if((integers[index] == (const int)*(valuePtr)) &&
835  (intType[index] == vtype))
836  {
837  // The value & type has not changed, so do nothing.
838  return(true);
839  }
840  else
841  {
842  // Tag buffer size changes if type changes, so subtract & add.
843  origType = intType[index];
844  appendIntArrayValue(index, origTag);
845  tagBufferSize -= getNumericTagTypeSize(intType[index]);
846  tagBufferSize += getNumericTagTypeSize(vtype);
847  integers[index] = (const int)*(valuePtr);
848  intType[index] = vtype;
849  }
850  break;
851  case 'Z' :
852  // First check to see if the value changed.
853  if(strings[index] == valuePtr)
854  {
855  // The value has not changed, so do nothing.
856  return(true);
857  }
858  else
859  {
860  // Adjust the tagBufferSize by removing the size of the old string.
861  origTag = strings[index];
862  tagBufferSize -= strings[index].Length();
863  strings[index] = valuePtr;
864  // Adjust the tagBufferSize by adding the size of the new string.
865  tagBufferSize += strings[index].Length();
866  }
867  break;
868  case 'B' :
869  // First check to see if the value changed.
870  if(strings[index] == valuePtr)
871  {
872  // The value has not changed, so do nothing.
873  return(true);
874  }
875  else
876  {
877  // Adjust the tagBufferSize by removing the size of the old field.
878  origTag = strings[index];
879  tagBufferSize -= getBtagBufferSize(strings[index]);
880  strings[index] = valuePtr;
881  // Adjust the tagBufferSize by adding the size of the new field.
882  tagBufferSize += getBtagBufferSize(strings[index]);
883  }
884  break;
885  case 'f' :
886  // First check to see if the value changed.
887  if(floats[index] == (float)atof(valuePtr))
888  {
889  // The value has not changed, so do nothing.
890  return(true);
891  }
892  else
893  {
894  // Tag buffer size doesn't change between different 'f' entries.
895  origTag.appendFullFloat(floats[index]);
896  floats[index] = (float)atof(valuePtr);
897  }
898  break;
899  default :
900  fprintf(stderr,
901  "samRecord::addTag() - Unknown custom field of type %c\n",
902  vtype);
904  "Unknown custom field in a tag");
905  status = false;
906  break;
907  }
908 
909  // Duplicate tag in this record.
910  // Tag already existed, print message about overwriting.
911  // WARN about dropping duplicate tags.
912  if(myNumWarns++ < myMaxWarns)
913  {
914  fprintf(stderr, "WARNING: Duplicate Tags, overwritting %c%c:%c:%s with %c%c:%c:%s\n",
915  tag[0], tag[1], origType, origTag.c_str(), tag[0], tag[1], vtype, valuePtr);
916  if(myNumWarns == myMaxWarns)
917  {
918  fprintf(stderr, "Suppressing rest of Duplicate Tag warnings.\n");
919  }
920  }
921  }
922  else
923  {
924  // The key was not found in the hash, so add it.
925  switch (vtype)
926  {
927  case 'A' :
928  index = integers.Length();
929  integers.Push((const int)*(valuePtr));
930  intType.push_back(vtype);
931  tagBufferSize += 4;
932  break;
933  case 'Z' :
934  index = strings.Length();
935  strings.Push(valuePtr);
936  tagBufferSize += 4 + strings.Last().Length();
937  break;
938  case 'B' :
939  index = strings.Length();
940  strings.Push(valuePtr);
941  tagBufferSize += 3 + getBtagBufferSize(strings[index]);
942  break;
943  case 'f' :
944  index = floats.size();
945  floats.push_back((float)atof(valuePtr));
946  tagBufferSize += 7;
947  break;
948  default :
949  fprintf(stderr,
950  "samRecord::addTag() - Unknown custom field of type %c\n",
951  vtype);
953  "Unknown custom field in a tag");
954  status = false;
955  break;
956  }
957  if(status)
958  {
959  // If successful, add the key to extras.
960  extras.Add(key, index);
961  }
962  }
963 
964  // Only add the tag if it has so far been successfully processed.
965  if(status)
966  {
967  // The buffer tags are now out of sync.
968  myNeedToSetTagsInBuffer = true;
969  myIsTagsBufferValid = false;
970  myIsBufferSynced = false;
971  myTagBufferSize += tagBufferSize;
972  }
973  return(status);
974 }
975 
976 
978 {
979  if(extras.Entries() != 0)
980  {
981  extras.Clear();
982  }
983  strings.Clear();
984  integers.Clear();
985  intType.clear();
986  floats.clear();
987  myTagBufferSize = 0;
988  resetTagIter();
989 }
990 
991 
992 bool SamRecord::rmTag(const char* tag, char type)
993 {
994  // Check the length of tag.
995  if(strlen(tag) != 2)
996  {
997  // Tag is the wrong length.
998  myStatus.setStatus(SamStatus::INVALID,
999  "rmTag called with tag that is not 2 characters\n");
1000  return(false);
1001  }
1002 
1003  myStatus = SamStatus::SUCCESS;
1004  if(myNeedToSetTagsFromBuffer)
1005  {
1006  if(!setTagsFromBuffer())
1007  {
1008  // Failed to read the tags from the buffer, so cannot
1009  // get tags.
1010  return(false);
1011  }
1012  }
1013 
1014  // Construct the key.
1015  int key = MAKEKEY(tag[0], tag[1], type);
1016  // Look to see if the key exsists in the hash.
1017  int offset = extras.Find(key);
1018 
1019  if(offset < 0)
1020  {
1021  // Not found, so return true, successfully removed since
1022  // it is not in tag.
1023  return(true);
1024  }
1025 
1026  // Offset is set, so the key was found.
1027  // First if it is an integer, determine the actual type of the int.
1028  char vtype;
1029  getTypeFromKey(key, vtype);
1030  if(vtype == 'i')
1031  {
1032  vtype = getIntegerType(offset);
1033  }
1034 
1035  // Offset is set, so recalculate the buffer size without this entry.
1036  // Do NOT remove from strings, integers, or floats because then
1037  // extras would need to be updated for all entries with the new indexes
1038  // into those variables.
1039  int rmBuffSize = 0;
1040  switch(vtype)
1041  {
1042  case 'A':
1043  case 'c':
1044  case 'C':
1045  rmBuffSize = 4;
1046  break;
1047  case 's':
1048  case 'S':
1049  rmBuffSize = 5;
1050  break;
1051  case 'i':
1052  case 'I':
1053  rmBuffSize = 7;
1054  break;
1055  case 'f':
1056  rmBuffSize = 7;
1057  break;
1058  case 'Z':
1059  rmBuffSize = 4 + getString(offset).Length();
1060  break;
1061  case 'B':
1062  rmBuffSize = 3 + getBtagBufferSize(getString(offset));
1063  break;
1064  default:
1065  myStatus.setStatus(SamStatus::INVALID,
1066  "rmTag called with unknown type.\n");
1067  return(false);
1068  break;
1069  };
1070 
1071  // The buffer tags are now out of sync.
1072  myNeedToSetTagsInBuffer = true;
1073  myIsTagsBufferValid = false;
1074  myIsBufferSynced = false;
1075  myTagBufferSize -= rmBuffSize;
1076 
1077  // Remove from the hash.
1078  extras.Delete(offset);
1079  return(true);
1080 }
1081 
1082 
1083 bool SamRecord::rmTags(const char* tags)
1084 {
1085  const char* currentTagPtr = tags;
1086 
1087  myStatus = SamStatus::SUCCESS;
1088  if(myNeedToSetTagsFromBuffer)
1089  {
1090  if(!setTagsFromBuffer())
1091  {
1092  // Failed to read the tags from the buffer, so cannot
1093  // get tags.
1094  return(false);
1095  }
1096  }
1097 
1098  bool returnStatus = true;
1099 
1100  int rmBuffSize = 0;
1101  while(*currentTagPtr != '\0')
1102  {
1103 
1104  // Tags are formatted as: XY:Z
1105  // Where X is [A-Za-z], Y is [A-Za-z], and
1106  // Z is A,i,f,Z,H (cCsSI are also excepted)
1107  if((currentTagPtr[0] == '\0') || (currentTagPtr[1] == '\0') ||
1108  (currentTagPtr[2] != ':') || (currentTagPtr[3] == '\0'))
1109  {
1110  myStatus.setStatus(SamStatus::INVALID,
1111  "rmTags called with improperly formatted tags.\n");
1112  returnStatus = false;
1113  break;
1114  }
1115 
1116  // Construct the key.
1117  int key = MAKEKEY(currentTagPtr[0], currentTagPtr[1],
1118  currentTagPtr[3]);
1119  // Look to see if the key exsists in the hash.
1120  int offset = extras.Find(key);
1121 
1122  if(offset >= 0)
1123  {
1124  // Offset is set, so the key was found.
1125  // First if it is an integer, determine the actual type of the int.
1126  char vtype;
1127  getTypeFromKey(key, vtype);
1128  if(vtype == 'i')
1129  {
1130  vtype = getIntegerType(offset);
1131  }
1132 
1133  // Offset is set, so recalculate the buffer size without this entry.
1134  // Do NOT remove from strings, integers, or floats because then
1135  // extras would need to be updated for all entries with the new indexes
1136  // into those variables.
1137  switch(vtype)
1138  {
1139  case 'A':
1140  case 'c':
1141  case 'C':
1142  rmBuffSize += 4;
1143  break;
1144  case 's':
1145  case 'S':
1146  rmBuffSize += 5;
1147  break;
1148  case 'i':
1149  case 'I':
1150  rmBuffSize += 7;
1151  break;
1152  case 'f':
1153  rmBuffSize += 7;
1154  break;
1155  case 'Z':
1156  rmBuffSize += 4 + getString(offset).Length();
1157  break;
1158  case 'B':
1159  rmBuffSize += 3 + getBtagBufferSize(getString(offset));
1160  break;
1161  default:
1162  myStatus.setStatus(SamStatus::INVALID,
1163  "rmTag called with unknown type.\n");
1164  returnStatus = false;
1165  break;
1166  };
1167 
1168  // Remove from the hash.
1169  extras.Delete(offset);
1170  }
1171  // Increment to the next tag.
1172  if((currentTagPtr[4] == ';') || (currentTagPtr[4] == ','))
1173  {
1174  // Increment once more.
1175  currentTagPtr += 5;
1176  }
1177  else if(currentTagPtr[4] != '\0')
1178  {
1179  // Invalid tag format.
1180  myStatus.setStatus(SamStatus::INVALID,
1181  "rmTags called with improperly formatted tags.\n");
1182  returnStatus = false;
1183  break;
1184  }
1185  else
1186  {
1187  // Last Tag.
1188  currentTagPtr += 4;
1189  }
1190  }
1191 
1192  // The buffer tags are now out of sync.
1193  myNeedToSetTagsInBuffer = true;
1194  myIsTagsBufferValid = false;
1195  myIsBufferSynced = false;
1196  myTagBufferSize -= rmBuffSize;
1197 
1198 
1199  return(returnStatus);
1200 }
1201 
1202 
1203 // Get methods for record fields.
1205 {
1206  return(getRecordBuffer(mySequenceTranslation));
1207 }
1208 
1209 
1210 // Get methods for record fields.
1212 {
1213  myStatus = SamStatus::SUCCESS;
1214  bool status = true;
1215  // If the buffer is not synced or the sequence in the buffer is not
1216  // properly translated, fix the buffer.
1217  if((myIsBufferSynced == false) ||
1218  (myBufferSequenceTranslation != translation))
1219  {
1220  status &= fixBuffer(translation);
1221  }
1222  // If the buffer is synced, check to see if the tags need to be synced.
1223  if(myNeedToSetTagsInBuffer)
1224  {
1225  status &= setTagsInBuffer();
1226  }
1227  if(!status)
1228  {
1229  return(NULL);
1230  }
1231  return (const void *)myRecordPtr;
1232 }
1233 
1234 
1235 // Write the record as a buffer into the file using the class's
1236 // sequence translation setting.
1238 {
1239  return(writeRecordBuffer(filePtr, mySequenceTranslation));
1240 }
1241 
1242 
1243 // Write the record as a buffer into the file using the specified translation.
1245  SequenceTranslation translation)
1246 {
1247  myStatus = SamStatus::SUCCESS;
1248  if((filePtr == NULL) || (filePtr->isOpen() == false))
1249  {
1250  // File is not open, return failure.
1252  "Can't write to an unopened file.");
1253  return(SamStatus::FAIL_ORDER);
1254  }
1255 
1256  if((myIsBufferSynced == false) ||
1257  (myBufferSequenceTranslation != translation))
1258  {
1259  if(!fixBuffer(translation))
1260  {
1261  return(myStatus.getStatus());
1262  }
1263  }
1264 
1265  // Write the record.
1266  unsigned int numBytesToWrite = myRecordPtr->myBlockSize + sizeof(int32_t);
1267  unsigned int numBytesWritten =
1268  ifwrite(filePtr, myRecordPtr, numBytesToWrite);
1269 
1270  // Return status based on if the correct number of bytes were written.
1271  if(numBytesToWrite == numBytesWritten)
1272  {
1273  return(SamStatus::SUCCESS);
1274  }
1275  // The correct number of bytes were not written.
1276  myStatus.setStatus(SamStatus::FAIL_IO, "Failed to write the entire record.");
1277  return(SamStatus::FAIL_IO);
1278 }
1279 
1280 
1282 {
1283  myStatus = SamStatus::SUCCESS;
1284  // If the buffer isn't synced, sync the buffer to determine the
1285  // block size.
1286  if(myIsBufferSynced == false)
1287  {
1288  // Since this just returns the block size, the translation of
1289  // the sequence does not matter, so just use the currently set
1290  // value.
1291  fixBuffer(myBufferSequenceTranslation);
1292  }
1293  return myRecordPtr->myBlockSize;
1294 }
1295 
1296 
1297 // This method returns the reference name.
1299 {
1300  myStatus = SamStatus::SUCCESS;
1301  return myReferenceName.c_str();
1302 }
1303 
1304 
1306 {
1307  myStatus = SamStatus::SUCCESS;
1308  return myRecordPtr->myReferenceID;
1309 }
1310 
1311 
1313 {
1314  myStatus = SamStatus::SUCCESS;
1315  return (myRecordPtr->myPosition + 1);
1316 }
1317 
1318 
1320 {
1321  myStatus = SamStatus::SUCCESS;
1322  return myRecordPtr->myPosition;
1323 }
1324 
1325 
1327 {
1328  myStatus = SamStatus::SUCCESS;
1329  // If the buffer is valid, return the size from there, otherwise get the
1330  // size from the string length + 1 (ending null).
1331  if(myIsReadNameBufferValid)
1332  {
1333  return(myRecordPtr->myReadNameLength);
1334  }
1335 
1336  return(myReadName.Length() + 1);
1337 }
1338 
1339 
1341 {
1342  myStatus = SamStatus::SUCCESS;
1343  return myRecordPtr->myMapQuality;
1344 }
1345 
1346 
1348 {
1349  myStatus = SamStatus::SUCCESS;
1350  if(!myIsBinValid)
1351  {
1352  // The bin that is set in the record is not valid, so
1353  // reset it.
1354  myRecordPtr->myBin =
1355  bam_reg2bin(myRecordPtr->myPosition, get1BasedAlignmentEnd());
1356  myIsBinValid = true;
1357  }
1358  return(myRecordPtr->myBin);
1359 }
1360 
1361 
1363 {
1364  myStatus = SamStatus::SUCCESS;
1365  // If the cigar buffer is valid
1366  // then get the length from there.
1367  if(myIsCigarBufferValid)
1368  {
1369  return myRecordPtr->myCigarLength;
1370  }
1371 
1372  if(myCigarTempBufferLength == -1)
1373  {
1374  // The cigar buffer is not valid and the cigar temp buffer is not set,
1375  // so parse the string.
1376  parseCigarString();
1377  }
1378 
1379  // The temp buffer is now set, so return the size.
1380  return(myCigarTempBufferLength);
1381 }
1382 
1383 
1385 {
1386  myStatus = SamStatus::SUCCESS;
1387  return myRecordPtr->myFlag;
1388 }
1389 
1390 
1392 {
1393  myStatus = SamStatus::SUCCESS;
1394  if(myIsSequenceBufferValid == false)
1395  {
1396  // If the sequence is "*", then return 0.
1397  if((mySequence.Length() == 1) && (mySequence[0] == '*'))
1398  {
1399  return(0);
1400  }
1401  // Do not add 1 since it is not null terminated.
1402  return(mySequence.Length());
1403  }
1404  return(myRecordPtr->myReadLength);
1405 }
1406 
1407 
1408 // This method returns the mate reference name. If it is equal to the
1409 // reference name, it still returns the reference name.
1411 {
1412  myStatus = SamStatus::SUCCESS;
1413  return myMateReferenceName.c_str();
1414 }
1415 
1416 
1417 // This method returns the mate reference name. If it is equal to the
1418 // reference name, it returns "=", unless they are both "*" in which case
1419 // "*" is returned.
1421 {
1422  myStatus = SamStatus::SUCCESS;
1423  if(myMateReferenceName == "*")
1424  {
1425  return(myMateReferenceName);
1426  }
1427  if(myMateReferenceName == getReferenceName())
1428  {
1429  return(FIELD_ABSENT_STRING);
1430  }
1431  else
1432  {
1433  return(myMateReferenceName);
1434  }
1435 }
1436 
1437 
1439 {
1440  myStatus = SamStatus::SUCCESS;
1441  return myRecordPtr->myMateReferenceID;
1442 }
1443 
1444 
1446 {
1447  myStatus = SamStatus::SUCCESS;
1448  return (myRecordPtr->myMatePosition + 1);
1449 }
1450 
1451 
1453 {
1454  myStatus = SamStatus::SUCCESS;
1455  return myRecordPtr->myMatePosition;
1456 }
1457 
1458 
1460 {
1461  myStatus = SamStatus::SUCCESS;
1462  return myRecordPtr->myInsertSize;
1463 }
1464 
1465 
1466 // Returns the inclusive rightmost position of the clipped sequence.
1468 {
1469  myStatus = SamStatus::SUCCESS;
1470  if(myAlignmentLength == -1)
1471  {
1472  // Alignment end has not been set, so calculate it.
1473  parseCigar();
1474  }
1475  // If alignment length > 0, subtract 1 from it to get the end.
1476  if(myAlignmentLength == 0)
1477  {
1478  // Length is 0, just return the start position.
1479  return(myRecordPtr->myPosition);
1480  }
1481  return(myRecordPtr->myPosition + myAlignmentLength - 1);
1482 }
1483 
1484 
1485 // Returns the inclusive rightmost position of the clipped sequence.
1487 {
1488  return(get0BasedAlignmentEnd() + 1);
1489 }
1490 
1491 
1492 // Return the length of the alignment.
1494 {
1495  myStatus = SamStatus::SUCCESS;
1496  if(myAlignmentLength == -1)
1497  {
1498  // Alignment end has not been set, so calculate it.
1499  parseCigar();
1500  }
1501  // Return the alignment length.
1502  return(myAlignmentLength);
1503 }
1504 
1505 // Returns the inclusive left-most position adjust for clipped bases.
1507 {
1508  myStatus = SamStatus::SUCCESS;
1509  if(myUnclippedStartOffset == -1)
1510  {
1511  // Unclipped has not yet been calculated, so parse the cigar to get it
1512  parseCigar();
1513  }
1514  return(myRecordPtr->myPosition - myUnclippedStartOffset);
1515 }
1516 
1517 
1518 // Returns the inclusive left-most position adjust for clipped bases.
1520 {
1521  return(get0BasedUnclippedStart() + 1);
1522 }
1523 
1524 
1525 // Returns the inclusive right-most position adjust for clipped bases.
1527 {
1528  // myUnclippedEndOffset will be set by get0BasedAlignmentEnd if the
1529  // cigar has not yet been parsed, so no need to check it here.
1530  return(get0BasedAlignmentEnd() + myUnclippedEndOffset);
1531 }
1532 
1533 
1534 // Returns the inclusive right-most position adjust for clipped bases.
1536 {
1537  return(get0BasedUnclippedEnd() + 1);
1538 }
1539 
1540 
1541 // Get the read name.
1543 {
1544  myStatus = SamStatus::SUCCESS;
1545  if(myReadName.Length() == 0)
1546  {
1547  // 0 Length, means that it is in the buffer, but has not yet
1548  // been synced to the string, so do the sync.
1549  myReadName = (char*)&(myRecordPtr->myData);
1550  }
1551  return myReadName.c_str();
1552 }
1553 
1554 
1555 const char* SamRecord::getCigar()
1556 {
1557  myStatus = SamStatus::SUCCESS;
1558  if(myCigar.Length() == 0)
1559  {
1560  // 0 Length, means that it is in the buffer, but has not yet
1561  // been synced to the string, so do the sync.
1562  parseCigarBinary();
1563  }
1564  return myCigar.c_str();
1565 }
1566 
1567 
1569 {
1570  return(getSequence(mySequenceTranslation));
1571 }
1572 
1573 
1575 {
1576  myStatus = SamStatus::SUCCESS;
1577  if(mySequence.Length() == 0)
1578  {
1579  // 0 Length, means that it is in the buffer, but has not yet
1580  // been synced to the string, so do the sync.
1581  setSequenceAndQualityFromBuffer();
1582  }
1583 
1584  // Determine if translation needs to be done.
1585  if((translation == NONE) || (myRefPtr == NULL))
1586  {
1587  return mySequence.c_str();
1588  }
1589  else if(translation == EQUAL)
1590  {
1591  if(mySeqWithEq.length() == 0)
1592  {
1593  // Check to see if the sequence is defined.
1594  if(mySequence == "*")
1595  {
1596  // Sequence is undefined, so no translation necessary.
1597  mySeqWithEq = '*';
1598  }
1599  else
1600  {
1601  // Sequence defined, so translate it.
1602  SamQuerySeqWithRef::seqWithEquals(mySequence.c_str(),
1603  myRecordPtr->myPosition,
1604  *(getCigarInfo()),
1605  getReferenceName(),
1606  *myRefPtr,
1607  mySeqWithEq);
1608  }
1609  }
1610  return(mySeqWithEq.c_str());
1611  }
1612  else
1613  {
1614  // translation == BASES
1615  if(mySeqWithoutEq.length() == 0)
1616  {
1617  if(mySequence == "*")
1618  {
1619  // Sequence is undefined, so no translation necessary.
1620  mySeqWithoutEq = '*';
1621  }
1622  else
1623  {
1624  // Sequence defined, so translate it.
1625  SamQuerySeqWithRef::seqWithoutEquals(mySequence.c_str(),
1626  myRecordPtr->myPosition,
1627  *(getCigarInfo()),
1628  getReferenceName(),
1629  *myRefPtr,
1630  mySeqWithoutEq);
1631  }
1632  }
1633  return(mySeqWithoutEq.c_str());
1634  }
1635 }
1636 
1637 
1639 {
1640  myStatus = SamStatus::SUCCESS;
1641  if(myQuality.Length() == 0)
1642  {
1643  // 0 Length, means that it is in the buffer, but has not yet
1644  // been synced to the string, so do the sync.
1645  setSequenceAndQualityFromBuffer();
1646  }
1647  return myQuality.c_str();
1648 }
1649 
1650 
1651 char SamRecord::getSequence(int index)
1652 {
1653  return(getSequence(index, mySequenceTranslation));
1654 }
1655 
1656 
1657 char SamRecord::getSequence(int index, SequenceTranslation translation)
1658 {
1659  static const char * asciiBases = "=AC.G...T......N";
1660 
1661  // Determine the read length.
1662  int32_t readLen = getReadLength();
1663 
1664  // If the read length is 0, this method should not be called.
1665  if(readLen == 0)
1666  {
1667  String exceptionString = "SamRecord::getSequence(";
1668  exceptionString += index;
1669  exceptionString += ") is not allowed since sequence = '*'";
1670  throw std::runtime_error(exceptionString.c_str());
1671  }
1672  else if((index < 0) || (index >= readLen))
1673  {
1674  // Only get here if the index was out of range, so thow an exception.
1675  String exceptionString = "SamRecord::getSequence(";
1676  exceptionString += index;
1677  exceptionString += ") is out of range. Index must be between 0 and ";
1678  exceptionString += (readLen - 1);
1679  throw std::runtime_error(exceptionString.c_str());
1680  }
1681 
1682  // Determine if translation needs to be done.
1683  if((translation == NONE) || (myRefPtr == NULL))
1684  {
1685  // No translation needs to be done.
1686  if(mySequence.Length() == 0)
1687  {
1688  // Parse BAM sequence.
1689  if(myIsSequenceBufferValid)
1690  {
1691  return(index & 1 ?
1692  asciiBases[myPackedSequence[index / 2] & 0xF] :
1693  asciiBases[myPackedSequence[index / 2] >> 4]);
1694  }
1695  else
1696  {
1697  String exceptionString = "SamRecord::getSequence(";
1698  exceptionString += index;
1699  exceptionString += ") called with no sequence set";
1700  throw std::runtime_error(exceptionString.c_str());
1701  }
1702  }
1703  // Already have string.
1704  return(mySequence[index]);
1705  }
1706  else
1707  {
1708  // Need to translate the sequence either to have '=' or to not
1709  // have it.
1710  // First check to see if the sequence has been set.
1711  if(mySequence.Length() == 0)
1712  {
1713  // 0 Length, means that it is in the buffer, but has not yet
1714  // been synced to the string, so do the sync.
1715  setSequenceAndQualityFromBuffer();
1716  }
1717 
1718  // Check the type of translation.
1719  if(translation == EQUAL)
1720  {
1721  // Check whether or not the string has already been
1722  // retrieved that has the '=' in it.
1723  if(mySeqWithEq.length() == 0)
1724  {
1725  // The string with '=' has not yet been determined,
1726  // so get the string.
1727  // Check to see if the sequence is defined.
1728  if(mySequence == "*")
1729  {
1730  // Sequence is undefined, so no translation necessary.
1731  mySeqWithEq = '*';
1732  }
1733  else
1734  {
1735  // Sequence defined, so translate it.
1736  SamQuerySeqWithRef::seqWithEquals(mySequence.c_str(),
1737  myRecordPtr->myPosition,
1738  *(getCigarInfo()),
1739  getReferenceName(),
1740  *myRefPtr,
1741  mySeqWithEq);
1742  }
1743  }
1744  // Sequence is set, so return it.
1745  return(mySeqWithEq[index]);
1746  }
1747  else
1748  {
1749  // translation == BASES
1750  // Check whether or not the string has already been
1751  // retrieved that does not have the '=' in it.
1752  if(mySeqWithoutEq.length() == 0)
1753  {
1754  // The string with '=' has not yet been determined,
1755  // so get the string.
1756  // Check to see if the sequence is defined.
1757  if(mySequence == "*")
1758  {
1759  // Sequence is undefined, so no translation necessary.
1760  mySeqWithoutEq = '*';
1761  }
1762  else
1763  {
1764  // Sequence defined, so translate it.
1765  // The string without '=' has not yet been determined,
1766  // so get the string.
1767  SamQuerySeqWithRef::seqWithoutEquals(mySequence.c_str(),
1768  myRecordPtr->myPosition,
1769  *(getCigarInfo()),
1770  getReferenceName(),
1771  *myRefPtr,
1772  mySeqWithoutEq);
1773  }
1774  }
1775  // Sequence is set, so return it.
1776  return(mySeqWithoutEq[index]);
1777  }
1778  }
1779 }
1780 
1781 
1782 char SamRecord::getQuality(int index)
1783 {
1784  // Determine the read length.
1785  int32_t readLen = getReadLength();
1786 
1787  // If the read length is 0, return ' ' whose ascii code is below
1788  // the minimum ascii code for qualities.
1789  if(readLen == 0)
1790  {
1792  }
1793  else if((index < 0) || (index >= readLen))
1794  {
1795  // Only get here if the index was out of range, so thow an exception.
1796  String exceptionString = "SamRecord::getQuality(";
1797  exceptionString += index;
1798  exceptionString += ") is out of range. Index must be between 0 and ";
1799  exceptionString += (readLen - 1);
1800  throw std::runtime_error(exceptionString.c_str());
1801  }
1802 
1803  if(myQuality.Length() == 0)
1804  {
1805  // Parse BAM Quality.
1806  // Know that myPackedQuality is correct since readLen != 0.
1807  return(myPackedQuality[index] + 33);
1808  }
1809  else
1810  {
1811  // Already have string.
1812  if((myQuality.Length() == 1) && (myQuality[0] == '*'))
1813  {
1814  // Return the unknown quality character.
1816  }
1817  else if(index >= myQuality.Length())
1818  {
1819  // Only get here if the index was out of range, so thow an exception.
1820  // Technically the myQuality string is not guaranteed to be the same length
1821  // as the sequence, so this catches that error.
1822  String exceptionString = "SamRecord::getQuality(";
1823  exceptionString += index;
1824  exceptionString += ") is out of range. Index must be between 0 and ";
1825  exceptionString += (myQuality.Length() - 1);
1826  throw std::runtime_error(exceptionString.c_str());
1827  }
1828  else
1829  {
1830  return(myQuality[index]);
1831  }
1832  }
1833 }
1834 
1835 
1837 {
1838  // Check to see whether or not the Cigar has already been
1839  // set - this is determined by checking if alignment length
1840  // is set since alignment length and the cigar are set
1841  // at the same time.
1842  if(myAlignmentLength == -1)
1843  {
1844  // Not been set, so calculate it.
1845  parseCigar();
1846  }
1847  return(&myCigarRoller);
1848 }
1849 
1850 
1851 // Return the number of bases in this read that overlap the passed in
1852 // region. (start & end are 0-based)
1853 uint32_t SamRecord::getNumOverlaps(int32_t start, int32_t end)
1854 {
1855  // Determine whether or not the cigar has been parsed, which sets up
1856  // the cigar roller. This is determined by checking the alignment length.
1857  if(myAlignmentLength == -1)
1858  {
1859  parseCigar();
1860  }
1861  return(myCigarRoller.getNumOverlaps(start, end, get0BasedPosition()));
1862 }
1863 
1864 
1865 // Returns the values of all fields except the tags.
1866 bool SamRecord::getFields(bamRecordStruct& recStruct, String& readName,
1867  String& cigar, String& sequence, String& quality)
1868 {
1869  return(getFields(recStruct, readName, cigar, sequence, quality,
1870  mySequenceTranslation));
1871 }
1872 
1873 
1874 // Returns the values of all fields except the tags.
1875 bool SamRecord::getFields(bamRecordStruct& recStruct, String& readName,
1876  String& cigar, String& sequence, String& quality,
1877  SequenceTranslation translation)
1878 {
1879  myStatus = SamStatus::SUCCESS;
1880  if(myIsBufferSynced == false)
1881  {
1882  if(!fixBuffer(translation))
1883  {
1884  // failed to set the buffer, return false.
1885  return(false);
1886  }
1887  }
1888  memcpy(&recStruct, myRecordPtr, sizeof(bamRecordStruct));
1889 
1890  readName = getReadName();
1891  // Check the status.
1892  if(myStatus != SamStatus::SUCCESS)
1893  {
1894  // Failed to set the fields, return false.
1895  return(false);
1896  }
1897  cigar = getCigar();
1898  // Check the status.
1899  if(myStatus != SamStatus::SUCCESS)
1900  {
1901  // Failed to set the fields, return false.
1902  return(false);
1903  }
1904  sequence = getSequence(translation);
1905  // Check the status.
1906  if(myStatus != SamStatus::SUCCESS)
1907  {
1908  // Failed to set the fields, return false.
1909  return(false);
1910  }
1911  quality = getQuality();
1912  // Check the status.
1913  if(myStatus != SamStatus::SUCCESS)
1914  {
1915  // Failed to set the fields, return false.
1916  return(false);
1917  }
1918  return(true);
1919 }
1920 
1921 
1922 // Returns the reference pointer.
1924 {
1925  return(myRefPtr);
1926 }
1927 
1928 
1930 {
1931  myStatus = SamStatus::SUCCESS;
1932  if(myNeedToSetTagsFromBuffer)
1933  {
1934  // Tags are only set in the buffer, so the size of the tags is
1935  // the length of the record minus the starting location of the tags.
1936  unsigned char * tagStart =
1937  (unsigned char *)myRecordPtr->myData
1938  + myRecordPtr->myReadNameLength
1939  + myRecordPtr->myCigarLength * sizeof(int)
1940  + (myRecordPtr->myReadLength + 1) / 2 + myRecordPtr->myReadLength;
1941 
1942  // The non-tags take up from the start of the record to the tag start.
1943  // Do not include the block size part of the record since it is not
1944  // included in the size.
1945  uint32_t nonTagSize =
1946  tagStart - (unsigned char*)&(myRecordPtr->myReferenceID);
1947  // Tags take up the size of the block minus the non-tag section.
1948  uint32_t tagSize = myRecordPtr->myBlockSize - nonTagSize;
1949  return(tagSize);
1950  }
1951 
1952  // Tags are stored outside the buffer, so myTagBufferSize is set.
1953  return(myTagBufferSize);
1954 }
1955 
1956 
1957 // Returns true if there is another tag and sets tag and vtype to the
1958 // appropriate values, and returns a pointer to the value.
1959 // Sets the Status to SUCCESS when a tag is successfully returned or
1960 // when there are no more tags. Otherwise the status is set to describe
1961 // why it failed (parsing, etc).
1962 bool SamRecord::getNextSamTag(char* tag, char& vtype, void** value)
1963 {
1964  myStatus = SamStatus::SUCCESS;
1965  if(myNeedToSetTagsFromBuffer)
1966  {
1967  if(!setTagsFromBuffer())
1968  {
1969  // Failed to read the tags from the buffer, so cannot
1970  // get tags.
1971  return(false);
1972  }
1973  }
1974 
1975  // Increment the tag index to start looking at the next tag.
1976  // At the beginning, it is set to -1.
1977  myLastTagIndex++;
1978  int maxTagIndex = extras.Capacity();
1979  if(myLastTagIndex >= maxTagIndex)
1980  {
1981  // Hit the end of the tags, return false, no more tags.
1982  // Status is still success since this is not an error,
1983  // it is just the end of the list.
1984  return(false);
1985  }
1986 
1987  bool tagFound = false;
1988  // Loop until a tag is found or the end of extras is hit.
1989  while((tagFound == false) && (myLastTagIndex < maxTagIndex))
1990  {
1991  if(extras.SlotInUse(myLastTagIndex))
1992  {
1993  // Found a slot to use.
1994  int key = extras.GetKey(myLastTagIndex);
1995  getTag(key, tag);
1996  getTypeFromKey(key, vtype);
1997  tagFound = true;
1998  // Get the value associated with the key based on the vtype.
1999  switch (vtype)
2000  {
2001  case 'f' :
2002  *value = getFloatPtr(myLastTagIndex);
2003  break;
2004  case 'i' :
2005  *value = getIntegerPtr(myLastTagIndex, vtype);
2006  if(vtype != 'A')
2007  {
2008  // Convert all int types to 'i'
2009  vtype = 'i';
2010  }
2011  break;
2012  case 'Z' :
2013  case 'B' :
2014  *value = getStringPtr(myLastTagIndex);
2015  break;
2016  default:
2018  "Unknown tag type");
2019  tagFound = false;
2020  break;
2021  }
2022  }
2023  if(!tagFound)
2024  {
2025  // Increment the index since a tag was not found.
2026  myLastTagIndex++;
2027  }
2028  }
2029  return(tagFound);
2030 }
2031 
2032 
2033 // Reset the tag iterator to the beginning of the tags.
2035 {
2036  myLastTagIndex = -1;
2037 }
2038 
2039 
2041 {
2042  if((vtype == 'c') || (vtype == 'C') ||
2043  (vtype == 's') || (vtype == 'S') ||
2044  (vtype == 'i') || (vtype == 'I'))
2045  {
2046  return(true);
2047  }
2048  return(false);
2049 }
2050 
2051 
2052 bool SamRecord::isFloatType(char vtype)
2053 {
2054  if(vtype == 'f')
2055  {
2056  return(true);
2057  }
2058  return(false);
2059 }
2060 
2061 
2062 bool SamRecord::isCharType(char vtype)
2063 {
2064  if(vtype == 'A')
2065  {
2066  return(true);
2067  }
2068  return(false);
2069 }
2070 
2071 
2072 bool SamRecord::isStringType(char vtype)
2073 {
2074  if((vtype == 'Z') || (vtype == 'B'))
2075  {
2076  return(true);
2077  }
2078  return(false);
2079 }
2080 
2081 
2082 bool SamRecord::getTagsString(const char* tags, String& returnString, char delim)
2083 {
2084  const char* currentTagPtr = tags;
2085 
2086  returnString.Clear();
2087  myStatus = SamStatus::SUCCESS;
2088  if(myNeedToSetTagsFromBuffer)
2089  {
2090  if(!setTagsFromBuffer())
2091  {
2092  // Failed to read the tags from the buffer, so cannot
2093  // get tags.
2094  return(false);
2095  }
2096  }
2097 
2098  bool returnStatus = true;
2099 
2100  while(*currentTagPtr != '\0')
2101  {
2102  // Tags are formatted as: XY:Z
2103  // Where X is [A-Za-z], Y is [A-Za-z], and
2104  // Z is A,i,f,Z,H (cCsSI are also excepted)
2105  if((currentTagPtr[0] == '\0') || (currentTagPtr[1] == '\0') ||
2106  (currentTagPtr[2] != ':') || (currentTagPtr[3] == '\0'))
2107  {
2108  myStatus.setStatus(SamStatus::INVALID,
2109  "getTagsString called with improperly formatted tags.\n");
2110  returnStatus = false;
2111  break;
2112  }
2113 
2114  // Construct the key.
2115  int key = MAKEKEY(currentTagPtr[0], currentTagPtr[1],
2116  currentTagPtr[3]);
2117  // Look to see if the key exsists in the hash.
2118  int offset = extras.Find(key);
2119 
2120  if(offset >= 0)
2121  {
2122  // Offset is set, so the key was found.
2123  if(!returnString.IsEmpty())
2124  {
2125  returnString += delim;
2126  }
2127  returnString += currentTagPtr[0];
2128  returnString += currentTagPtr[1];
2129  returnString += ':';
2130  returnString += currentTagPtr[3];
2131  returnString += ':';
2132 
2133  // First if it is an integer, determine the actual type of the int.
2134  char vtype;
2135  getTypeFromKey(key, vtype);
2136 
2137  switch(vtype)
2138  {
2139  case 'i':
2140  returnString += *(int*)getIntegerPtr(offset, vtype);
2141  break;
2142  case 'f':
2143  returnString += *(float*)getFloatPtr(offset);
2144  break;
2145  case 'Z':
2146  case 'B':
2147  returnString += *(String*)getStringPtr(offset);
2148  break;
2149  default:
2150  myStatus.setStatus(SamStatus::INVALID,
2151  "rmTag called with unknown type.\n");
2152  returnStatus = false;
2153  break;
2154  };
2155  }
2156  // Increment to the next tag.
2157  if((currentTagPtr[4] == ';') || (currentTagPtr[4] == ','))
2158  {
2159  // Increment once more.
2160  currentTagPtr += 5;
2161  }
2162  else if(currentTagPtr[4] != '\0')
2163  {
2164  // Invalid tag format.
2165  myStatus.setStatus(SamStatus::INVALID,
2166  "rmTags called with improperly formatted tags.\n");
2167  returnStatus = false;
2168  break;
2169  }
2170  else
2171  {
2172  // Last Tag.
2173  currentTagPtr += 4;
2174  }
2175  }
2176  return(returnStatus);
2177 }
2178 
2179 
2180 const String* SamRecord::getStringTag(const char * tag)
2181 {
2182  // Parse the buffer if necessary.
2183  if(myNeedToSetTagsFromBuffer)
2184  {
2185  if(!setTagsFromBuffer())
2186  {
2187  // Failed to read the tags from the buffer, so cannot
2188  // get tags. setTagsFromBuffer set the errors,
2189  // so just return null.
2190  return(NULL);
2191  }
2192  }
2193 
2194  int key = MAKEKEY(tag[0], tag[1], 'Z');
2195  int offset = extras.Find(key);
2196 
2197  int value;
2198  if (offset < 0)
2199  {
2200  // Check for 'B' tag.
2201  key = MAKEKEY(tag[0], tag[1], 'B');
2202  offset = extras.Find(key);
2203  if(offset < 0)
2204  {
2205  // Tag not found.
2206  return(NULL);
2207  }
2208  }
2209 
2210  // Offset is valid, so return the tag.
2211  value = extras[offset];
2212  return(&(strings[value]));
2213 }
2214 
2215 
2216 int* SamRecord::getIntegerTag(const char * tag)
2217 {
2218  // Init to success.
2219  myStatus = SamStatus::SUCCESS;
2220  // Parse the buffer if necessary.
2221  if(myNeedToSetTagsFromBuffer)
2222  {
2223  if(!setTagsFromBuffer())
2224  {
2225  // Failed to read the tags from the buffer, so cannot
2226  // get tags. setTagsFromBuffer set the errors,
2227  // so just return NULL.
2228  return(NULL);
2229  }
2230  }
2231 
2232  int key = MAKEKEY(tag[0], tag[1], 'i');
2233  int offset = extras.Find(key);
2234 
2235  int value;
2236  if (offset < 0)
2237  {
2238  // Failed to find the tag.
2239  return(NULL);
2240  }
2241  else
2242  value = extras[offset];
2243 
2244  return(&(integers[value]));
2245 }
2246 
2247 
2248 bool SamRecord::getIntegerTag(const char * tag, int& tagVal)
2249 {
2250  // Init to success.
2251  myStatus = SamStatus::SUCCESS;
2252  // Parse the buffer if necessary.
2253  if(myNeedToSetTagsFromBuffer)
2254  {
2255  if(!setTagsFromBuffer())
2256  {
2257  // Failed to read the tags from the buffer, so cannot
2258  // get tags. setTagsFromBuffer set the errors,
2259  // so just return false.
2260  return(false);
2261  }
2262  }
2263 
2264  int key = MAKEKEY(tag[0], tag[1], 'i');
2265  int offset = extras.Find(key);
2266 
2267  int value;
2268  if (offset < 0)
2269  {
2270  // Failed to find the tag.
2271  return(false);
2272  }
2273  else
2274  value = extras[offset];
2275 
2276  tagVal = integers[value];
2277  return(true);
2278 }
2279 
2280 
2281 bool SamRecord::getFloatTag(const char * tag, float& tagVal)
2282 {
2283  // Init to success.
2284  myStatus = SamStatus::SUCCESS;
2285  // Parse the buffer if necessary.
2286  if(myNeedToSetTagsFromBuffer)
2287  {
2288  if(!setTagsFromBuffer())
2289  {
2290  // Failed to read the tags from the buffer, so cannot
2291  // get tags. setTagsFromBuffer set the errors,
2292  // so just return false.
2293  return(false);
2294  }
2295  }
2296 
2297  int key = MAKEKEY(tag[0], tag[1], 'f');
2298  int offset = extras.Find(key);
2299 
2300  int value;
2301  if (offset < 0)
2302  {
2303  // Failed to find the tag.
2304  return(false);
2305  }
2306  else
2307  value = extras[offset];
2308 
2309  tagVal = floats[value];
2310  return(true);
2311 }
2312 
2313 
2314 const String & SamRecord::getString(const char * tag)
2315 {
2316  // Init to success.
2317  myStatus = SamStatus::SUCCESS;
2318  // Parse the buffer if necessary.
2319  if(myNeedToSetTagsFromBuffer)
2320  {
2321  if(!setTagsFromBuffer())
2322  {
2323  // Failed to read the tags from the buffer, so cannot
2324  // get tags.
2325  // TODO - what do we want to do on failure?
2326  }
2327  }
2328 
2329  int key = MAKEKEY(tag[0], tag[1], 'Z');
2330  int offset = extras.Find(key);
2331 
2332  int value;
2333  if (offset < 0)
2334  {
2335 
2336  key = MAKEKEY(tag[0], tag[1], 'B');
2337  offset = extras.Find(key);
2338  if (offset < 0)
2339  {
2340  // TODO - what do we want to do on failure?
2341  return(NOT_FOUND_TAG_STRING);
2342  }
2343  }
2344  value = extras[offset];
2345 
2346  return strings[value];
2347 }
2348 
2349 
2350 int & SamRecord::getInteger(const char * tag)
2351 {
2352  // Init to success.
2353  myStatus = SamStatus::SUCCESS;
2354  // Parse the buffer if necessary.
2355  if(myNeedToSetTagsFromBuffer)
2356  {
2357  if(!setTagsFromBuffer())
2358  {
2359  // Failed to read the tags from the buffer, so cannot
2360  // get tags. setTagsFromBuffer set the error.
2361  // TODO - what do we want to do on failure?
2362  }
2363  }
2364 
2365  int key = MAKEKEY(tag[0], tag[1], 'i');
2366  int offset = extras.Find(key);
2367 
2368  int value;
2369  if (offset < 0)
2370  {
2371  // TODO - what do we want to do on failure?
2372  return NOT_FOUND_TAG_INT;
2373  }
2374  else
2375  value = extras[offset];
2376 
2377  return integers[value];
2378 }
2379 
2380 
2381 bool SamRecord::checkTag(const char * tag, char type)
2382 {
2383  // Init to success.
2384  myStatus = SamStatus::SUCCESS;
2385  // Parse the buffer if necessary.
2386  if(myNeedToSetTagsFromBuffer)
2387  {
2388  if(!setTagsFromBuffer())
2389  {
2390  // Failed to read the tags from the buffer, so cannot
2391  // get tags. setTagsFromBuffer set the error.
2392  return("");
2393  }
2394  }
2395 
2396  int key = MAKEKEY(tag[0], tag[1], type);
2397 
2398  return (extras.Find(key) != LH_NOTFOUND);
2399 }
2400 
2401 
2402 // Return the error after a failed SamRecord call.
2404 {
2405  return(myStatus);
2406 }
2407 
2408 
2409 // Allocate space for the record - does a realloc.
2410 // The passed in size is the size of the entire record including the
2411 // block size field.
2412 bool SamRecord::allocateRecordStructure(int size)
2413 {
2414  if (allocatedSize < size)
2415  {
2416  bamRecordStruct* tmpRecordPtr =
2417  (bamRecordStruct *)realloc(myRecordPtr, size);
2418  if(tmpRecordPtr == NULL)
2419  {
2420  // FAILED to allocate memory
2421  fprintf(stderr, "FAILED TO ALLOCATE MEMORY!!!");
2422  myStatus.addError(SamStatus::FAIL_MEM, "Failed Memory Allocation.");
2423  return(false);
2424  }
2425  // Successfully allocated memory, so set myRecordPtr.
2426  myRecordPtr = tmpRecordPtr;
2427 
2428  // Reset the pointers into the record.
2429  if(myIsSequenceBufferValid)
2430  {
2431  myPackedSequence = (unsigned char *)myRecordPtr->myData +
2432  myRecordPtr->myReadNameLength +
2433  myRecordPtr->myCigarLength * sizeof(int);
2434  }
2435  if(myIsQualityBufferValid)
2436  {
2437  myPackedQuality = (unsigned char *)myRecordPtr->myData +
2438  myRecordPtr->myReadNameLength +
2439  myRecordPtr->myCigarLength * sizeof(int) +
2440  (myRecordPtr->myReadLength + 1) / 2;
2441  }
2442 
2443  allocatedSize = size;
2444  }
2445  return(true);
2446 }
2447 
2448 
2449 // Index is the index into the strings array.
2450 void* SamRecord::getStringPtr(int index)
2451 {
2452  int value = extras[index];
2453 
2454  return &(strings[value]);
2455 }
2456 
2457 void* SamRecord::getIntegerPtr(int offset, char& type)
2458 {
2459  int value = extras[offset];
2460 
2461  type = intType[value];
2462 
2463  return &(integers[value]);
2464 }
2465 
2466 void* SamRecord::getFloatPtr(int offset)
2467 {
2468  int value = extras[offset];
2469 
2470  return &(floats[value]);
2471 }
2472 
2473 
2474 // Fixes the buffer to match the variable length fields if they are set.
2475 bool SamRecord::fixBuffer(SequenceTranslation translation)
2476 {
2477  // Check to see if the buffer is already synced.
2478  if(myIsBufferSynced &&
2479  (myBufferSequenceTranslation == translation))
2480  {
2481  // Already synced, nothing to do.
2482  return(true);
2483  }
2484 
2485  // Set the bin if necessary.
2486  if(!myIsBinValid)
2487  {
2488  // The bin that is set in the record is not valid, so
2489  // reset it.
2490  myRecordPtr->myBin =
2491  bam_reg2bin(myRecordPtr->myPosition, get1BasedAlignmentEnd());
2492  myIsBinValid = true;
2493  }
2494 
2495  // Not synced.
2496  bool status = true;
2497 
2498  // First determine the size the buffer needs to be.
2499  uint8_t newReadNameLen = getReadNameLength();
2500  uint16_t newCigarLen = getCigarLength();
2501  int32_t newReadLen = getReadLength();
2502  uint32_t newTagLen = getTagLength();
2503  uint32_t bamSequenceLen = (newReadLen+1)/2;
2504 
2505  // The buffer size extends from the start of the record to data
2506  // plus the length of the variable fields,
2507  // Multiply the cigar length by 4 since it is the number of uint32_t fields.
2508  int newBufferSize =
2509  ((unsigned char*)(&(myRecordPtr->myData)) -
2510  (unsigned char*)myRecordPtr) +
2511  newReadNameLen + ((newCigarLen)*4) +
2512  newReadLen + bamSequenceLen + newTagLen;
2513 
2514  if(!allocateRecordStructure(newBufferSize))
2515  {
2516  // Failed to allocate space.
2517  return(false);
2518  }
2519 
2520  // Now that space has been added to the buffer, check to see what if
2521  // any fields need to be extracted from the buffer prior to starting to
2522  // overwrite it. Fields need to be extracted from the buffer if the
2523  // buffer is valid for the field and a previous variable length field has
2524  // changed length.
2525  bool readNameLenChange = (newReadNameLen != myRecordPtr->myReadNameLength);
2526  bool cigarLenChange = (newCigarLen != myRecordPtr->myCigarLength);
2527  bool readLenChange = (newReadLen != myRecordPtr->myReadLength);
2528 
2529  // If the tags are still stored in the buffer and any other fields changed
2530  // lengths, they need to be extracted.
2531  if(myIsTagsBufferValid &&
2532  (readNameLenChange | cigarLenChange | readLenChange))
2533  {
2534  status &= setTagsFromBuffer();
2535  // The tag buffer will not be valid once the other fields
2536  // are written, so set it to not valid.
2537  myIsTagsBufferValid = false;
2538  }
2539 
2540  // If the sequence or quality strings are still stored in the buffer, and
2541  // any of the previous fields have changed length, extract it from the
2542  // current buffer.
2543  if((myIsQualityBufferValid | myIsSequenceBufferValid) &&
2544  (readNameLenChange | cigarLenChange | readLenChange))
2545  {
2546  setSequenceAndQualityFromBuffer();
2547  // The quality and sequence buffers will not be valid once the other
2548  // fields are written, so set them to not valid.
2549  myIsQualityBufferValid = false;
2550  myIsSequenceBufferValid = false;
2551  }
2552 
2553  // If the cigar is still stored in the buffer, and any of the
2554  // previous fields have changed length, extract it from the current buffer.
2555  if((myIsCigarBufferValid) &&
2556  (readNameLenChange))
2557  {
2558  status &= parseCigarBinary();
2559  myIsCigarBufferValid = false;
2560  }
2561 
2562  // Set each value in the buffer if it is not already valid.
2563  if(!myIsReadNameBufferValid)
2564  {
2565  memcpy(&(myRecordPtr->myData), myReadName.c_str(),
2566  newReadNameLen);
2567 
2568  // Set the new ReadNameLength.
2569  myRecordPtr->myReadNameLength = newReadNameLen;
2570  myIsReadNameBufferValid = true;
2571  }
2572 
2573  unsigned char * readNameEnds = (unsigned char*)(&(myRecordPtr->myData)) +
2574  myRecordPtr->myReadNameLength;
2575 
2576  // Set the Cigar. Need to reformat from the string to
2577  unsigned int * packedCigar = (unsigned int *) (void *) readNameEnds;
2578 
2579  if(!myIsCigarBufferValid)
2580  {
2581  // The cigar was already parsed when it was set, so just copy
2582  // data from the temporary buffer.
2583  myRecordPtr->myCigarLength = newCigarLen;
2584  memcpy(packedCigar, myCigarTempBuffer,
2585  myRecordPtr->myCigarLength * sizeof(uint32_t));
2586 
2587  myIsCigarBufferValid = true;
2588  }
2589 
2590  unsigned char * packedSequence = readNameEnds +
2591  myRecordPtr->myCigarLength * sizeof(int);
2592  unsigned char * packedQuality = packedSequence + bamSequenceLen;
2593 
2594  if(!myIsSequenceBufferValid || !myIsQualityBufferValid ||
2595  (myBufferSequenceTranslation != translation))
2596  {
2597  myRecordPtr->myReadLength = newReadLen;
2598  // Determine if the quality needs to be set and is just a * and needs to
2599  // be set to 0xFF.
2600  bool noQuality = false;
2601  if((myQuality.Length() == 1) && (myQuality[0] == '*'))
2602  {
2603  noQuality = true;
2604  }
2605 
2606  const char* translatedSeq = NULL;
2607  // If the sequence is not valid in the buffer or it is not
2608  // properly translated, get the properly translated sequence
2609  // that needs to be put into the buffer.
2610  if((!myIsSequenceBufferValid) ||
2611  (translation != myBufferSequenceTranslation))
2612  {
2613  translatedSeq = getSequence(translation);
2614  }
2615 
2616  for (int i = 0; i < myRecordPtr->myReadLength; i++)
2617  {
2618  if((!myIsSequenceBufferValid) ||
2619  (translation != myBufferSequenceTranslation))
2620  {
2621  // Sequence buffer is not valid, so set the sequence.
2622  int seqVal = 0;
2623  switch(translatedSeq[i])
2624  {
2625  case '=':
2626  seqVal = 0;
2627  break;
2628  case 'A':
2629  case 'a':
2630  seqVal = 1;
2631  break;
2632  case 'C':
2633  case 'c':
2634  seqVal = 2;
2635  break;
2636  case 'G':
2637  case 'g':
2638  seqVal = 4;
2639  break;
2640  case 'T':
2641  case 't':
2642  seqVal = 8;
2643  break;
2644  case 'N':
2645  case 'n':
2646  case '.':
2647  seqVal = 15;
2648  break;
2649  default:
2651  "Unknown Sequence character found.");
2652  status = false;
2653  break;
2654  };
2655 
2656  if(i & 1)
2657  {
2658  // Odd number i's go in the lower 4 bits, so OR in the
2659  // lower bits
2660  packedSequence[i/2] |= seqVal;
2661  }
2662  else
2663  {
2664  // Even i's go in the upper 4 bits and are always set first.
2665  packedSequence[i/2] = seqVal << 4;
2666  }
2667  }
2668 
2669  if(!myIsQualityBufferValid)
2670  {
2671  // Set the quality.
2672  if((noQuality) || (myQuality.Length() <= i))
2673  {
2674  // No quality or the quality is smaller than the sequence,
2675  // so set it to 0xFF
2676  packedQuality[i] = 0xFF;
2677  }
2678  else
2679  {
2680  // Copy the quality string.
2681  packedQuality[i] = myQuality[i] - 33;
2682  }
2683  }
2684  }
2685  myPackedSequence = (unsigned char *)myRecordPtr->myData +
2686  myRecordPtr->myReadNameLength +
2687  myRecordPtr->myCigarLength * sizeof(int);
2688  myPackedQuality = myPackedSequence +
2689  (myRecordPtr->myReadLength + 1) / 2;
2690  myIsSequenceBufferValid = true;
2691  myIsQualityBufferValid = true;
2692  myBufferSequenceTranslation = translation;
2693  }
2694 
2695  if(!myIsTagsBufferValid)
2696  {
2697  status &= setTagsInBuffer();
2698  }
2699 
2700  // Set the lengths in the buffer.
2701  myRecordPtr->myReadNameLength = newReadNameLen;
2702  myRecordPtr->myCigarLength = newCigarLen;
2703  myRecordPtr->myReadLength = newReadLen;
2704 
2705  // Set the buffer block size to the size of the buffer minus the
2706  // first field.
2707  myRecordPtr->myBlockSize = newBufferSize - sizeof(int32_t);
2708 
2709  if(status)
2710  {
2711  myIsBufferSynced = true;
2712  }
2713 
2714  return(status);
2715 }
2716 
2717 
2718 // Sets the Sequence and Quality strings from the buffer.
2719 // They are done together in one method because they require the same
2720 // loop, so might as well be done at the same time.
2721 void SamRecord::setSequenceAndQualityFromBuffer()
2722 {
2723  // NOTE: If the sequence buffer is not valid, do not set the sequence
2724  // string from the buffer.
2725  // NOTE: If the quality buffer is not valid, do not set the quality string
2726  // from the buffer.
2727 
2728  // Extract the sequence if the buffer is valid and the string's length is 0.
2729  bool extractSequence = false;
2730  if(myIsSequenceBufferValid && (mySequence.Length() == 0))
2731  {
2732  extractSequence = true;
2733  }
2734 
2735  // Extract the quality if the buffer is valid and the string's length is 0.
2736  bool extractQuality = false;
2737  if(myIsQualityBufferValid && (myQuality.Length() == 0))
2738  {
2739  extractQuality = true;
2740  }
2741 
2742  // If neither the quality nor the sequence need to be extracted,
2743  // just return.
2744  if(!extractSequence && !extractQuality)
2745  {
2746  return;
2747  }
2748 
2749  // Set the sequence and quality strings..
2750  if(extractSequence)
2751  {
2752  mySequence.SetLength(myRecordPtr->myReadLength);
2753  }
2754  if(extractQuality)
2755  {
2756  myQuality.SetLength(myRecordPtr->myReadLength);
2757  }
2758 
2759  const char * asciiBases = "=AC.G...T......N";
2760 
2761  // Flag to see if the quality is specified - the quality contains a value
2762  // other than 0xFF. If all values are 0xFF, then there is no quality.
2763  bool qualitySpecified = false;
2764 
2765  for (int i = 0; i < myRecordPtr->myReadLength; i++)
2766  {
2767  if(extractSequence)
2768  {
2769  mySequence[i] = i & 1 ?
2770  asciiBases[myPackedSequence[i / 2] & 0xF] :
2771  asciiBases[myPackedSequence[i / 2] >> 4];
2772  }
2773 
2774  if(extractQuality)
2775  {
2776  if(myPackedQuality[i] != 0xFF)
2777  {
2778  // Quality is specified, so mark the flag.
2779  qualitySpecified = true;
2780  }
2781 
2782  myQuality[i] = myPackedQuality[i] + 33;
2783  }
2784  }
2785 
2786  // If the read length is 0, then set the sequence and quality to '*'
2787  if(myRecordPtr->myReadLength == 0)
2788  {
2789  if(extractSequence)
2790  {
2791  mySequence = "*";
2792  }
2793  if(extractQuality)
2794  {
2795  myQuality = "*";
2796  }
2797  }
2798  else if(extractQuality && !qualitySpecified)
2799  {
2800  // No quality was specified, so set it to "*"
2801  myQuality = "*";
2802  }
2803 }
2804 
2805 
2806 // Parse the cigar to calculate the alignment/unclipped end.
2807 bool SamRecord::parseCigar()
2808 {
2809  // Determine if the cigar string or cigar binary needs to be parsed.
2810  if(myCigar.Length() == 0)
2811  {
2812  // The cigar string is not yet set, so parse the binary.
2813  return(parseCigarBinary());
2814  }
2815  return(parseCigarString());
2816 }
2817 
2818 // Parse the cigar to calculate the alignment/unclipped end.
2819 bool SamRecord::parseCigarBinary()
2820 {
2821  // Only need to parse if the string is not already set.
2822  // The length of the cigar string is set to zero when the
2823  // record is read from a file into the buffer.
2824  if(myCigar.Length() != 0)
2825  {
2826  // Already parsed.
2827  return(true);
2828  }
2829 
2830  unsigned char * readNameEnds =
2831  (unsigned char *)myRecordPtr->myData + myRecordPtr->myReadNameLength;
2832 
2833  unsigned int * packedCigar = (unsigned int *) (void *) readNameEnds;
2834 
2835  myCigarRoller.Set(packedCigar, myRecordPtr->myCigarLength);
2836 
2837  myCigarRoller.getCigarString(myCigar);
2838 
2839  myAlignmentLength = myCigarRoller.getExpectedReferenceBaseCount();
2840 
2841  myUnclippedStartOffset = myCigarRoller.getNumBeginClips();
2842  myUnclippedEndOffset = myCigarRoller.getNumEndClips();
2843 
2844  // if the cigar length is 0, then set the cigar string to "*"
2845  if(myRecordPtr->myCigarLength == 0)
2846  {
2847  myCigar = "*";
2848  return(true);
2849  }
2850 
2851  // Copy the cigar into a temporary buffer.
2852  int newBufferSize = myRecordPtr->myCigarLength * sizeof(uint32_t);
2853  if(newBufferSize > myCigarTempBufferAllocatedSize)
2854  {
2855  uint32_t* tempBufferPtr =
2856  (uint32_t*)realloc(myCigarTempBuffer, newBufferSize);
2857  if(tempBufferPtr == NULL)
2858  {
2859  // Failed to allocate memory.
2860  // Do not parse, just return.
2861  fprintf(stderr, "FAILED TO ALLOCATE MEMORY!!!");
2862  myStatus.addError(SamStatus::FAIL_MEM,
2863  "Failed to Allocate Memory.");
2864  return(false);
2865  }
2866  myCigarTempBuffer = tempBufferPtr;
2867  myCigarTempBufferAllocatedSize = newBufferSize;
2868  }
2869 
2870  memcpy(myCigarTempBuffer, packedCigar,
2871  myRecordPtr->myCigarLength * sizeof(uint32_t));
2872 
2873  // Set the length of the temp buffer.
2874  myCigarTempBufferLength = myRecordPtr->myCigarLength;
2875 
2876  return(true);
2877 }
2878 
2879 // Parse the cigar string to calculate the cigar length and alignment end.
2880 bool SamRecord::parseCigarString()
2881 {
2882  myCigarTempBufferLength = 0;
2883  if(myCigar == "*")
2884  {
2885  // Cigar is empty, so initialize the variables.
2886  myAlignmentLength = 0;
2887  myUnclippedStartOffset = 0;
2888  myUnclippedEndOffset = 0;
2889  myCigarRoller.clear();
2890  return(true);
2891  }
2892 
2893  myCigarRoller.Set(myCigar);
2894 
2895  myAlignmentLength = myCigarRoller.getExpectedReferenceBaseCount();
2896 
2897  myUnclippedStartOffset = myCigarRoller.getNumBeginClips();
2898  myUnclippedEndOffset = myCigarRoller.getNumEndClips();
2899 
2900  // Check to see if the Temporary Cigar Buffer is large enough to contain
2901  // this cigar. If we make it the size of the length of the cigar string,
2902  // it will be more than large enough.
2903  int newBufferSize = myCigar.Length() * sizeof(uint32_t);
2904  if(newBufferSize > myCigarTempBufferAllocatedSize)
2905  {
2906  uint32_t* tempBufferPtr =
2907  (uint32_t*)realloc(myCigarTempBuffer, newBufferSize);
2908  if(tempBufferPtr == NULL)
2909  {
2910  // Failed to allocate memory.
2911  // Do not parse, just return.
2912  fprintf(stderr, "FAILED TO ALLOCATE MEMORY!!!");
2913  myStatus.addError(SamStatus::FAIL_MEM,
2914  "Failed to Allocate Memory.");
2915  return(false);
2916  }
2917  myCigarTempBuffer = tempBufferPtr;
2918  myCigarTempBufferAllocatedSize = newBufferSize;
2919  }
2920 
2921  // Track if there were any errors.
2922  bool status = true;
2923 
2924  // Track the index into the cigar string that is being parsed.
2925  char *cigarOp;
2926  const char* cigarEntryStart = myCigar.c_str();
2927  int opLen = 0;
2928  int op = 0;
2929 
2930  unsigned int * packedCigar = myCigarTempBuffer;
2931  // TODO - maybe one day make a cigar list... or maybe make a
2932  // reference cigar string for ease of lookup....
2933  const char* endCigarString = cigarEntryStart + myCigar.Length();
2934  while(cigarEntryStart < endCigarString)
2935  {
2936  bool validCigarEntry = true;
2937  // Get the opLen from the string. cigarOp will then point to
2938  // the operation.
2939  opLen = strtol(cigarEntryStart, &cigarOp, 10);
2940  // Switch on the type of operation.
2941  switch(*cigarOp)
2942  {
2943  case('M'):
2944  op = 0;
2945  break;
2946  case('I'):
2947  // Insert into the reference position, so do not increment the
2948  // reference end position.
2949  op = 1;
2950  break;
2951  case('D'):
2952  op = 2;
2953  break;
2954  case('N'):
2955  op = 3;
2956  break;
2957  case('S'):
2958  op = 4;
2959  break;
2960  case('H'):
2961  op = 5;
2962  break;
2963  case('P'):
2964  op = 6;
2965  break;
2966  default:
2967  fprintf(stderr, "ERROR parsing cigar\n");
2968  validCigarEntry = false;
2969  status = false;
2971  "Unknown operation found when parsing the Cigar.");
2972  break;
2973  };
2974  if(validCigarEntry)
2975  {
2976  // Increment the cigar length.
2977  ++myCigarTempBufferLength;
2978  *packedCigar = (opLen << 4) | op;
2979  packedCigar++;
2980  }
2981  // The next Entry starts at one past the cigar op, so set the start.
2982  cigarEntryStart = ++cigarOp;
2983  }
2984 
2985  // Check clipLength to adjust the end position.
2986  return(status);
2987 }
2988 
2989 
2990 bool SamRecord::setTagsFromBuffer()
2991 {
2992  // If the tags do not need to be set from the buffer, return true.
2993  if(myNeedToSetTagsFromBuffer == false)
2994  {
2995  // Already been set from the buffer.
2996  return(true);
2997  }
2998 
2999  // Mark false, as they are being set now.
3000  myNeedToSetTagsFromBuffer = false;
3001 
3002  unsigned char * extraPtr = myPackedQuality + myRecordPtr->myReadLength;
3003 
3004  // Default to success, will be changed to false on failure.
3005  bool status = true;
3006 
3007  // Clear any previously set tags.
3008  clearTags();
3009  while (myRecordPtr->myBlockSize + 4 -
3010  (extraPtr - (unsigned char *)myRecordPtr) > 0)
3011  {
3012  int key = 0;
3013  int value = 0;
3014  void * content = extraPtr + 3;
3015  int tagBufferSize = 0;
3016 
3017  key = MAKEKEY(extraPtr[0], extraPtr[1], extraPtr[2]);
3018 
3019  // First check if the tag already exists.
3020  unsigned int location = extras.Find(key);
3021  int origIndex = 0;
3022  String* duplicate = NULL;
3023  String* origTag = NULL;
3024  if(location != LH_NOTFOUND)
3025  {
3026  duplicate = new String;
3027  origTag = new String;
3028  origIndex = extras[location];
3029 
3030  *duplicate = (char)(extraPtr[0]);
3031  *duplicate += (char)(extraPtr[1]);
3032  *duplicate += ':';
3033 
3034  *origTag = *duplicate;
3035  *duplicate += (char)(extraPtr[2]);
3036  *duplicate += ':';
3037  }
3038 
3039  switch (extraPtr[2])
3040  {
3041  case 'A' :
3042  if(duplicate != NULL)
3043  {
3044  *duplicate += (* (char *) content);
3045  *origTag += intType[origIndex];
3046  *origTag += ':';
3047  appendIntArrayValue(origIndex, *origTag);
3048  tagBufferSize -= getNumericTagTypeSize(intType[origIndex]);
3049  integers[origIndex] = *(char *)content;
3050  intType[origIndex] = extraPtr[2];
3051  tagBufferSize += getNumericTagTypeSize(intType[origIndex]);
3052  }
3053  else
3054  {
3055  value = integers.Length();
3056  integers.Push(* (char *) content);
3057  intType.push_back(extraPtr[2]);
3058  tagBufferSize += 4;
3059  }
3060  extraPtr += 4;
3061  break;
3062  case 'c' :
3063  if(duplicate != NULL)
3064  {
3065  *duplicate += (* (char *) content);
3066  *origTag += intType[origIndex];
3067  *origTag += ':';
3068  appendIntArrayValue(origIndex, *origTag);
3069  tagBufferSize -= getNumericTagTypeSize(intType[origIndex]);
3070  integers[origIndex] = *(char *)content;
3071  intType[origIndex] = extraPtr[2];
3072  tagBufferSize += getNumericTagTypeSize(intType[origIndex]);
3073  }
3074  else
3075  {
3076  value = integers.Length();
3077  integers.Push(* (char *) content);
3078  intType.push_back(extraPtr[2]);
3079  tagBufferSize += 4;
3080  }
3081  extraPtr += 4;
3082  break;
3083  case 'C' :
3084  if(duplicate != NULL)
3085  {
3086  *duplicate += (* (unsigned char *) content);
3087  *origTag += intType[origIndex];
3088  *origTag += ':';
3089  appendIntArrayValue(origIndex, *origTag);
3090  tagBufferSize -= getNumericTagTypeSize(intType[origIndex]);
3091  integers[origIndex] = *(unsigned char *)content;
3092  intType[origIndex] = extraPtr[2];
3093  tagBufferSize += getNumericTagTypeSize(intType[origIndex]);
3094  }
3095  else
3096  {
3097  value = integers.Length();
3098  integers.Push(* (unsigned char *) content);
3099  intType.push_back(extraPtr[2]);
3100  tagBufferSize += 4;
3101  }
3102  extraPtr += 4;
3103  break;
3104  case 's' :
3105  if(duplicate != NULL)
3106  {
3107  *duplicate += (* (short *) content);
3108  *origTag += intType[origIndex];
3109  *origTag += ':';
3110  appendIntArrayValue(origIndex, *origTag);
3111  tagBufferSize -= getNumericTagTypeSize(intType[origIndex]);
3112  integers[origIndex] = *(short *)content;
3113  intType[origIndex] = extraPtr[2];
3114  tagBufferSize += getNumericTagTypeSize(intType[origIndex]);
3115  }
3116  else
3117  {
3118  value = integers.Length();
3119  integers.Push(* (short *) content);
3120  intType.push_back(extraPtr[2]);
3121  tagBufferSize += 5;
3122  }
3123  extraPtr += 5;
3124  break;
3125  case 'S' :
3126  if(duplicate != NULL)
3127  {
3128  *duplicate += (* (unsigned short *) content);
3129  *origTag += intType[origIndex];
3130  *origTag += ':';
3131  appendIntArrayValue(origIndex, *origTag);
3132  tagBufferSize -= getNumericTagTypeSize(intType[origIndex]);
3133  integers[origIndex] = *(unsigned short *)content;
3134  intType[origIndex] = extraPtr[2];
3135  tagBufferSize += getNumericTagTypeSize(intType[origIndex]);
3136  }
3137  else
3138  {
3139  value = integers.Length();
3140  integers.Push(* (unsigned short *) content);
3141  intType.push_back(extraPtr[2]);
3142  tagBufferSize += 5;
3143  }
3144  extraPtr += 5;
3145  break;
3146  case 'i' :
3147  if(duplicate != NULL)
3148  {
3149  *duplicate += (* (int *) content);
3150  *origTag += intType[origIndex];
3151  *origTag += ':';
3152  appendIntArrayValue(origIndex, *origTag);
3153  tagBufferSize -= getNumericTagTypeSize(intType[origIndex]);
3154  integers[origIndex] = *(int *)content;
3155  intType[origIndex] = extraPtr[2];
3156  tagBufferSize += getNumericTagTypeSize(intType[origIndex]);
3157  }
3158  else
3159  {
3160  value = integers.Length();
3161  integers.Push(* (int *) content);
3162  intType.push_back(extraPtr[2]);
3163  tagBufferSize += 7;
3164  }
3165  extraPtr += 7;
3166  break;
3167  case 'I' :
3168  if(duplicate != NULL)
3169  {
3170  *duplicate += (* (unsigned int *) content);
3171  *origTag += intType[origIndex];
3172  *origTag += ':';
3173  appendIntArrayValue(origIndex, *origTag);
3174  tagBufferSize -= getNumericTagTypeSize(intType[origIndex]);
3175  integers[origIndex] = *(unsigned int *)content;
3176  intType[origIndex] = extraPtr[2];
3177  tagBufferSize += getNumericTagTypeSize(intType[origIndex]);
3178  }
3179  else
3180  {
3181  value = integers.Length();
3182  integers.Push((int) * (unsigned int *) content);
3183  intType.push_back(extraPtr[2]);
3184  tagBufferSize += 7;
3185  }
3186  extraPtr += 7;
3187  break;
3188  case 'Z' :
3189  if(duplicate != NULL)
3190  {
3191  *duplicate += ((const char *) content);
3192  *origTag += 'Z';
3193  *origTag += ':';
3194  *origTag += (char*)(strings[origIndex]);
3195  tagBufferSize -= strings[origIndex].Length();
3196  strings[origIndex] = (const char *) content;
3197  extraPtr += 4 + strings[origIndex].Length();
3198  tagBufferSize += strings[origIndex].Length();
3199  }
3200  else
3201  {
3202  value = strings.Length();
3203  strings.Push((const char *) content);
3204  tagBufferSize += 4 + strings.Last().Length();
3205  extraPtr += 4 + strings.Last().Length();
3206  }
3207  break;
3208  case 'B' :
3209  if(duplicate != NULL)
3210  {
3211  *origTag += 'B';
3212  *origTag += ':';
3213  *origTag += (char*)(strings[origIndex]);
3214  tagBufferSize -=
3215  getBtagBufferSize(strings[origIndex]);
3216  int bufferSize =
3217  getStringFromBtagBuffer((unsigned char*)content,
3218  strings[origIndex]);
3219  *duplicate += (char *)(strings[origIndex]);
3220  tagBufferSize += bufferSize;
3221  extraPtr += 3 + bufferSize;
3222  }
3223  else
3224  {
3225  value = strings.Length();
3226  String tempBTag;
3227  int bufferSize =
3228  getStringFromBtagBuffer((unsigned char*)content,
3229  tempBTag);
3230  strings.Push(tempBTag);
3231  tagBufferSize += 3 + bufferSize;
3232  extraPtr += 3 + bufferSize;
3233  }
3234  break;
3235  case 'f' :
3236  if(duplicate != NULL)
3237  {
3238  duplicate->appendFullFloat(* (float *) content);
3239  *origTag += 'f';
3240  *origTag += ':';
3241  origTag->appendFullFloat(floats[origIndex]);
3242  floats[origIndex] = *(float *)content;
3243  }
3244  else
3245  {
3246  value = floats.size();
3247  floats.push_back(* (float *) content);
3248  tagBufferSize += 7;
3249  }
3250  extraPtr += 7;
3251  break;
3252  default :
3253  fprintf(stderr,
3254  "parsing BAM - Unknown custom field of type %c%c:%c\n",
3255  extraPtr[0], extraPtr[1], extraPtr[2]);
3256  fprintf(stderr, "BAM Tags: \n");
3257 
3258  unsigned char* tagInfo = myPackedQuality + myRecordPtr->myReadLength;
3259 
3260  fprintf(stderr, "\n\n");
3261  tagInfo = myPackedQuality + myRecordPtr->myReadLength;
3262  while(myRecordPtr->myBlockSize + 4 -
3263  (tagInfo - (unsigned char *)myRecordPtr) > 0)
3264  {
3265  fprintf(stderr, "%02x",tagInfo[0]);
3266  ++tagInfo;
3267  }
3268  fprintf(stderr, "\n");
3269 
3270  // Failed on read.
3271  // Increment extraPtr just by the size of the 3 known fields
3272  extraPtr += 3;
3274  "Unknown tag type.");
3275  status = false;
3276  }
3277 
3278  if(duplicate != NULL)
3279  {
3280  // Duplicate tag in this record.
3281  // Tag already existed, print message about overwriting.
3282  // WARN about dropping duplicate tags.
3283  if(myNumWarns++ < myMaxWarns)
3284  {
3285  fprintf(stderr, "WARNING: Duplicate Tags, overwritting %s with %s\n",
3286  origTag->c_str(), duplicate->c_str());
3287  if(myNumWarns == myMaxWarns)
3288  {
3289  fprintf(stderr, "Suppressing rest of Duplicate Tag warnings.\n");
3290  }
3291  }
3292 
3293  continue;
3294  }
3295 
3296  // Only add the tag if it has so far been successfully processed.
3297  if(status)
3298  {
3299  // Add the tag.
3300  extras.Add(key, value);
3301  myTagBufferSize += tagBufferSize;
3302  }
3303  }
3304  return(status);
3305 }
3306 
3307 
3308 bool SamRecord::setTagsInBuffer()
3309 {
3310  // The buffer size extends from the start of the record to data
3311  // plus the length of the variable fields,
3312  // Multiply the cigar length by 4 since it is the number of uint32_t fields.
3313  int bamSequenceLength = (myRecordPtr->myReadLength+1)/2;
3314  int newBufferSize = ((unsigned char*)(&(myRecordPtr->myData)) -
3315  (unsigned char*)myRecordPtr) +
3316  myRecordPtr->myReadNameLength + ((myRecordPtr->myCigarLength)*4) +
3317  myRecordPtr->myReadLength + bamSequenceLength + myTagBufferSize;
3318 
3319  // Make sure the buffer is big enough.
3320  if(!allocateRecordStructure(newBufferSize))
3321  {
3322  // Failed to allocate space.
3323  return(false);
3324  }
3325 
3326  char * extraPtr = (char*)myPackedQuality + myRecordPtr->myReadLength;
3327 
3328  bool status = true;
3329 
3330  // Set the tags in the buffer.
3331  if (extras.Entries())
3332  {
3333  for (int i = 0; i < extras.Capacity(); i++)
3334  {
3335  if (extras.SlotInUse(i))
3336  {
3337  int key = extras.GetKey(i);
3338  getTag(key, extraPtr);
3339  extraPtr += 2;
3340  char vtype;
3341  getTypeFromKey(key, vtype);
3342 
3343  if(vtype == 'i')
3344  {
3345  vtype = getIntegerType(i);
3346  }
3347 
3348  extraPtr[0] = vtype;
3349 
3350  // increment the pointer to where the value is.
3351  extraPtr += 1;
3352 
3353  switch (vtype)
3354  {
3355  case 'A' :
3356  *(char*)extraPtr = (char)getInteger(i);
3357  // sprintf(extraPtr, "%d", getInteger(i));
3358  extraPtr += 1;
3359  break;
3360  case 'c' :
3361  *(int8_t*)extraPtr = (int8_t)getInteger(i);
3362  // sprintf(extraPtr, "%.4d", getInteger(i));
3363  extraPtr += 1;
3364  break;
3365  case 'C' :
3366  *(uint8_t*)extraPtr = (uint8_t)getInteger(i);
3367  // sprintf(extraPtr, "%.4d", getInteger(i));
3368  extraPtr += 1;
3369  break;
3370  case 's' :
3371  *(int16_t*)extraPtr = (int16_t)getInteger(i);
3372  // sprintf(extraPtr, "%.4d", getInteger(i));
3373  extraPtr += 2;
3374  break;
3375  case 'S' :
3376  *(uint16_t*)extraPtr = (uint16_t)getInteger(i);
3377  // sprintf(extraPtr, "%.4d", getInteger(i));
3378  extraPtr += 2;
3379  break;
3380  case 'i' :
3381  *(int32_t*)extraPtr = (int32_t)getInteger(i);
3382  // sprintf(extraPtr, "%.4d", getInteger(i));
3383  extraPtr += 4;
3384  break;
3385  case 'I' :
3386  *(uint32_t*)extraPtr = (uint32_t)getInteger(i);
3387  // sprintf(extraPtr, "%.4d", getInteger(i));
3388  extraPtr += 4;
3389  break;
3390  case 'Z' :
3391  sprintf(extraPtr, "%s", getString(i).c_str());
3392  extraPtr += getString(i).Length() + 1;
3393  break;
3394  case 'B' :
3395  extraPtr += setBtagBuffer(getString(i), extraPtr);
3396  //--TODO-- Set buffer with correct B tag
3397  //sprintf(extraPtr, "%s", getString(i).c_str());
3398  // extraPtr += getBtagBufferSize(getString(i));
3399  break;
3400  case 'f' :
3401  *(float*)extraPtr = getFloat(i);
3402  extraPtr += 4;
3403  break;
3404  default :
3406  "Unknown tag type.");
3407  status = false;
3408  break;
3409  }
3410  }
3411  }
3412  }
3413 
3414  // Validate that the extra pointer is at the end of the allocated buffer.
3415  // If not then there was a problem.
3416  if(extraPtr != (char*)myRecordPtr + newBufferSize)
3417  {
3418  fprintf(stderr, "ERROR updating the buffer. Incorrect size.");
3420  "ERROR updating the buffer. Incorrect size.");
3421  status = false;
3422  }
3423 
3424 
3425  // The buffer tags are now in sync.
3426  myNeedToSetTagsInBuffer = false;
3427  myIsTagsBufferValid = true;
3428  return(status);
3429 }
3430 
3431 
3432 // Reset the variables for a newly set buffer. The buffer must be set first
3433 // since this looks up the reference ids in the buffer to set the reference
3434 // names.
3435 void SamRecord::setVariablesForNewBuffer(SamFileHeader& header)
3436 {
3437  // Lookup the reference name & mate reference name associated with this
3438  // record.
3439  myReferenceName =
3440  header.getReferenceLabel(myRecordPtr->myReferenceID);
3441  myMateReferenceName =
3442  header.getReferenceLabel(myRecordPtr->myMateReferenceID);
3443 
3444  // Clear the SAM Strings that are now not in-sync with the buffer.
3445  myReadName.SetLength(0);
3446  myCigar.SetLength(0);
3447  mySequence.SetLength(0);
3448  mySeqWithEq.clear();
3449  mySeqWithoutEq.clear();
3450  myQuality.SetLength(0);
3451  myNeedToSetTagsFromBuffer = true;
3452  myNeedToSetTagsInBuffer = false;
3453 
3454  //Set that the buffer is valid.
3455  myIsBufferSynced = true;
3456  // Set that the variable length buffer fields are valid.
3457  myIsReadNameBufferValid = true;
3458  myIsCigarBufferValid = true;
3459  myPackedSequence = (unsigned char *)myRecordPtr->myData +
3460  myRecordPtr->myReadNameLength + myRecordPtr->myCigarLength * sizeof(int);
3461  myIsSequenceBufferValid = true;
3462  myBufferSequenceTranslation = NONE;
3463  myPackedQuality = myPackedSequence +
3464  (myRecordPtr->myReadLength + 1) / 2;
3465  myIsQualityBufferValid = true;
3466  myIsTagsBufferValid = true;
3467  myIsBinValid = true;
3468 }
3469 
3470 
3471 // Extract the vtype from the key.
3472 void SamRecord::getTypeFromKey(int key, char& type) const
3473 {
3474  // Extract the type from the key.
3475  type = (key >> 16) & 0xFF;
3476 }
3477 
3478 
3479 // Extract the tag from the key.
3480 void SamRecord::getTag(int key, char* tag) const
3481 {
3482  // Extract the tag from the key.
3483  tag[0] = key & 0xFF;
3484  tag[1] = (key >> 8) & 0xFF;
3485  tag[2] = 0;
3486 }
3487 
3488 
3489 // Index is the index into the strings array.
3490 String & SamRecord::getString(int index)
3491 {
3492  int value = extras[index];
3493 
3494  return strings[value];
3495 }
3496 
3497 int & SamRecord::getInteger(int offset)
3498 {
3499  int value = extras[offset];
3500 
3501  return integers[value];
3502 }
3503 
3504 const char & SamRecord::getIntegerType(int offset) const
3505 {
3506  int value = extras[offset];
3507 
3508  return intType[value];
3509 }
3510 
3511 float & SamRecord::getFloat(int offset)
3512 {
3513  int value = extras[offset];
3514 
3515  return floats[value];
3516  }
3517 
3518 
3519 void SamRecord::appendIntArrayValue(char type, int value, String& strVal) const
3520 {
3521  switch(type)
3522  {
3523  case 'A':
3524  strVal += (char)value;
3525  break;
3526  case 'c':
3527  case 's':
3528  case 'i':
3529  case 'C':
3530  case 'S':
3531  case 'I':
3532  strVal += value;
3533  break;
3534  default:
3535  // Do nothing.
3536  ;
3537  }
3538 }
3539 
3540 
3541 int SamRecord::getBtagBufferSize(String& tagStr)
3542 {
3543  if(tagStr.Length() < 1)
3544  {
3545  // ERROR, needs at least the type.
3546  myStatus.addError(SamStatus::FAIL_PARSE,
3547  "SamRecord::getBtagBufferSize no tag subtype specified");
3548  return(0);
3549  }
3550  char type = tagStr[0];
3551  int elementSize = getNumericTagTypeSize(type);
3552  if(elementSize <= 0)
3553  {
3554  // ERROR, 'B' tag subtype must be numeric, so should be non-zero
3555  String errorMsg = "SamRecord::getBtagBufferSize invalid tag subtype, ";
3556  errorMsg += type;
3557  myStatus.addError(SamStatus::FAIL_PARSE, errorMsg.c_str());
3558  return(0);
3559  }
3560 
3561  // Separated by ',', so count the number of commas.
3562  int numElements = 0;
3563  int index = tagStr.FastFindChar(',', 0);
3564  while(index > 0)
3565  {
3566  ++numElements;
3567  index = tagStr.FastFindChar(',', index+1);
3568  }
3569  // Add 5 bytes: type & numElements
3570  return(numElements * elementSize + 5);
3571 }
3572 
3573 
3574 int SamRecord::setBtagBuffer(String& tagStr, char* extraPtr)
3575 {
3576  if(tagStr.Length() < 1)
3577  {
3578  // ERROR, needs at least the type.
3579  myStatus.addError(SamStatus::FAIL_PARSE,
3580  "SamRecord::getBtagBufferSize no tag subtype specified");
3581  return(0);
3582  }
3583  char type = tagStr[0];
3584  int elementSize = getNumericTagTypeSize(type);
3585  if(elementSize <= 0)
3586  {
3587  // ERROR, 'B' tag subtype must be numeric, so should be non-zero
3588  String errorMsg = "SamRecord::getBtagBufferSize invalid tag subtype, ";
3589  errorMsg += type;
3590  myStatus.addError(SamStatus::FAIL_PARSE, errorMsg.c_str());
3591  return(0);
3592  }
3593 
3594  int totalInc = 0;
3595  // Write the type.
3596  *(char*)extraPtr = type;
3597  ++extraPtr;
3598  ++totalInc;
3599 
3600  // Get the number of elements by counting ','s
3601  uint32_t numElements = 0;
3602  int index = tagStr.FastFindChar(',', 0);
3603  while(index > 0)
3604  {
3605  ++numElements;
3606  index = tagStr.FastFindChar(',', index+1);
3607  }
3608  *(uint32_t*)extraPtr = numElements;
3609  extraPtr += 4;
3610  totalInc += 4;
3611 
3612  const char* stringPtr = tagStr.c_str();
3613  const char* endPtr = stringPtr + tagStr.Length();
3614  // increment past the subtype and ','.
3615  stringPtr += 2;
3616 
3617  char* newPtr = NULL;
3618  while(stringPtr < endPtr)
3619  {
3620  switch(type)
3621  {
3622  case 'f':
3623  *(float*)extraPtr = (float)(strtod(stringPtr, &newPtr));
3624  break;
3625  case 'c':
3626  *(int8_t*)extraPtr = (int8_t)strtol(stringPtr, &newPtr, 0);
3627  break;
3628  case 's':
3629  *(int16_t*)extraPtr = (int16_t)strtol(stringPtr, &newPtr, 0);
3630  break;
3631  case 'i':
3632  *(int32_t*)extraPtr = (int32_t)strtol(stringPtr, &newPtr, 0);
3633  break;
3634  case 'C':
3635  *(uint8_t*)extraPtr = (uint8_t)strtoul(stringPtr, &newPtr, 0);
3636  break;
3637  case 'S':
3638  *(uint16_t*)extraPtr = (uint16_t)strtoul(stringPtr, &newPtr, 0);
3639  break;
3640  case 'I':
3641  *(uint32_t*)extraPtr = (uint32_t)strtoul(stringPtr, &newPtr, 0);
3642  break;
3643  default :
3645  "Unknown 'B' tag subtype.");
3646  break;
3647  }
3648  extraPtr += elementSize;
3649  totalInc += elementSize;
3650  stringPtr = newPtr + 1; // skip the ','
3651  }
3652  return(totalInc);
3653 }
3654 
3655 
3656 int SamRecord::getStringFromBtagBuffer(unsigned char* buffer,
3657  String& tagStr)
3658 {
3659  tagStr.Clear();
3660 
3661  int bufferSize = 0;
3662 
3663  // 1st byte is the type.
3664  char type = *buffer;
3665  ++buffer;
3666  ++bufferSize;
3667  tagStr = type;
3668 
3669  // 2nd-5th bytes are the size
3670  unsigned int numEntries = *(unsigned int *)buffer;
3671  buffer += 4;
3672  bufferSize += 4;
3673  // Num Entries is not included in the string.
3674 
3675  int subtypeSize = getNumericTagTypeSize(type);
3676 
3677  for(unsigned int i = 0; i < numEntries; i++)
3678  {
3679  tagStr += ',';
3680  switch(type)
3681  {
3682  case 'f':
3683  tagStr.appendFullFloat(*(float *)buffer);
3684  break;
3685  case 'c':
3686  tagStr += *(int8_t *)buffer;
3687  break;
3688  case 's':
3689  tagStr += *(int16_t *)buffer;
3690  break;
3691  case 'i':
3692  tagStr += *(int32_t *)buffer;
3693  break;
3694  case 'C':
3695  tagStr += *(uint8_t *)buffer;
3696  break;
3697  case 'S':
3698  tagStr += *(uint16_t *)buffer;
3699  break;
3700  case 'I':
3701  tagStr += *(uint32_t *)buffer;
3702  break;
3703  default :
3705  "Unknown 'B' tag subtype.");
3706  break;
3707  }
3708  buffer += subtypeSize;
3709  bufferSize += subtypeSize;
3710  }
3711  return(bufferSize);
3712 }
int ifeof(IFILE file)
Check to see if we have reached the EOF (returns 0 if not EOF).
Definition: InputFile.h:654
unsigned int ifread(IFILE file, void *buffer, unsigned int size)
Read up to size bytes from the file into the buffer.
Definition: InputFile.h:600
unsigned int ifwrite(IFILE file, const void *buffer, unsigned int size)
Write the specified number of bytes from the specified buffer into the file.
Definition: InputFile.h:669
static const char UNKNOWN_QUALITY_CHAR
Character used when the quality is unknown.
Definition: BaseUtilities.h:49
bool Remove(int index)
Remove the operation at the specified index.
bool IncrementCount(int index, int increment)
Increments the count for the operation at the specified index by the specified value,...
bool Update(int index, Operation op, int count)
Updates the operation at the specified index to be the specified operation and have the specified cou...
void clear()
Clear this object so that it has no Cigar Operations.
void Set(const char *cigarString)
Sets this object to the specified cigarString.
This class represents the CIGAR without any methods to set the cigar (see CigarRoller for that).
Definition: Cigar.h:84
int size() const
Return the number of cigar operations.
Definition: Cigar.h:364
static bool isMatchOrMismatch(Operation op)
Return true if the specified operation is a match/mismatch operation, false if not.
Definition: Cigar.h:298
static bool foundInQuery(Operation op)
Return true if the specified operation is found in the query sequence, false if not.
Definition: Cigar.h:219
int getNumBeginClips() const
Return the number of clips that are at the beginning of the cigar.
Definition: Cigar.cpp:144
int getNumEndClips() const
Return the number of clips that are at the end of the cigar.
Definition: Cigar.cpp:166
int getExpectedReferenceBaseCount() const
Return the number of bases in the reference that this CIGAR "spans".
Definition: Cigar.cpp:120
@ insert
insertion to the reference (the query sequence contains bases that have no corresponding base in the ...
Definition: Cigar.h:91
uint32_t getNumOverlaps(int32_t start, int32_t end, int32_t queryStartPos)
Return the number of bases that overlap the reference and the read associated with this cigar that fa...
Definition: Cigar.cpp:334
void getCigarString(String &cigarString) const
Set the passed in String to the string reprentation of the Cigar operations in this object.
Definition: Cigar.cpp:52
HandlingType
This specifies how this class should respond to errors.
Definition: ErrorHandler.h:29
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
Class for easily reading/writing files without having to worry about file type (uncompressed,...
Definition: InputFile.h:37
const char * getFileName() const
Get the filename that is currently opened.
Definition: InputFile.h:473
bool isOpen() const
Returns whether or not the file was successfully opened.
Definition: InputFile.h:423
This class allows a user to get/set the fields in a SAM/BAM Header.
Definition: SamFileHeader.h:35
int getReferenceID(const String &referenceName, bool addID=false)
Get the reference ID for the specified reference name (chromosome).
const String & getReferenceLabel(int id) const
Return the reference name (chromosome) for the specified reference id.
static void seqWithoutEquals(const char *currentSeq, int32_t seq0BasedPos, Cigar &cigar, const char *referenceName, const GenomeSequence &refSequence, std::string &updatedSeq)
Gets the sequence converting '=' to the appropriate base using the reference.
static void seqWithEquals(const char *currentSeq, int32_t seq0BasedPos, Cigar &cigar, const char *referenceName, const GenomeSequence &refSequence, std::string &updatedSeq)
Gets the sequence with '=' in any position where the sequence matches the reference.
int32_t getBlockSize()
Get the block size of the record (BAM format).
Definition: SamRecord.cpp:1281
uint16_t getCigarLength()
Get the length of the BAM formatted CIGAR.
Definition: SamRecord.cpp:1362
const char * getReferenceName()
Get the reference sequence name (RNAME) of the record.
Definition: SamRecord.cpp:1298
SequenceTranslation
Enum containing the settings on how to translate the sequence if a reference is available.
Definition: SamRecord.h:57
@ NONE
Leave the sequence as is.
Definition: SamRecord.h:58
@ BASES
Translate '=' to the actual base.
Definition: SamRecord.h:60
@ EQUAL
Translate bases that match the reference to '='.
Definition: SamRecord.h:59
bool setReadName(const char *readName)
Set QNAME to the passed in name.
Definition: SamRecord.cpp:193
int32_t getInsertSize()
Get the inferred insert size of the read pair (ISIZE) or observed template length (TLEN).
Definition: SamRecord.cpp:1459
int32_t get0BasedMatePosition()
Get the 0-based(BAM) leftmost mate/next fragment's position.
Definition: SamRecord.cpp:1452
int32_t get1BasedPosition()
Get the 1-based(SAM) leftmost position (POS) of the record.
Definition: SamRecord.cpp:1312
void clearTags()
Clear the tags in this record.
Definition: SamRecord.cpp:977
bool addIntTag(const char *tag, int32_t value)
Add the specified integer tag to the record.
Definition: SamRecord.cpp:647
int32_t getReferenceID()
Get the reference sequence id of the record (BAM format rid).
Definition: SamRecord.cpp:1305
bool getTagsString(const char *tags, String &returnString, char delim='\t')
Get the string representation of the tags from the record, formatted as TAG:TYPE:VALUE<delim>TAG:TYPE...
Definition: SamRecord.cpp:2082
GenomeSequence * getReference()
Returns a pointer to the genome sequence object associated with this record if it was set (NULL if it...
Definition: SamRecord.cpp:1923
int32_t getAlignmentLength()
Returns the length of the clipped sequence, returning 0 if the cigar is '*'.
Definition: SamRecord.cpp:1493
int & getInteger(const char *tag)
Get the integer value for the specified tag, DEPRECATED, use getIntegerTag that returns a bool.
Definition: SamRecord.cpp:2350
bool setInsertSize(int32_t insertSize)
Sets the inferred insert size (ISIZE)/observed template length (TLEN).
Definition: SamRecord.cpp:336
int32_t get1BasedAlignmentEnd()
Returns the 1-based inclusive rightmost position of the clipped sequence.
Definition: SamRecord.cpp:1486
uint32_t getTagLength()
Returns the length of the BAM formatted tags.
Definition: SamRecord.cpp:1929
SamRecord()
Default Constructor.
Definition: SamRecord.cpp:34
static bool isIntegerType(char vtype)
Returns whether or not the specified vtype is an integer type.
Definition: SamRecord.cpp:2040
bool rmTag(const char *tag, char type)
Remove a tag.
Definition: SamRecord.cpp:992
bool setMateReferenceName(SamFileHeader &header, const char *mateReferenceName)
Set the mate/next fragment's reference sequence name (RNEXT) to the specified name,...
Definition: SamRecord.cpp:297
uint8_t getReadNameLength()
Get the length of the readname (QNAME) including the null.
Definition: SamRecord.cpp:1326
Cigar * getCigarInfo()
Returns a pointer to the Cigar object associated with this record.
Definition: SamRecord.cpp:1836
bool getFloatTag(const char *tag, float &tagVal)
Get the float value for the specified tag.
Definition: SamRecord.cpp:2281
SamStatus::Status writeRecordBuffer(IFILE filePtr)
Write the record as a BAM into the specified already opened file.
Definition: SamRecord.cpp:1237
const char * getMateReferenceNameOrEqual()
Get the mate/next fragment's reference sequence name (RNEXT), returning "=" if it is the same as the ...
Definition: SamRecord.cpp:1420
bool setMapQuality(uint8_t mapQuality)
Set the mapping quality (MAPQ).
Definition: SamRecord.cpp:251
static bool isFloatType(char vtype)
Returns whether or not the specified vtype is a float type.
Definition: SamRecord.cpp:2052
SamStatus::Status setBuffer(const char *fromBuffer, uint32_t fromBufferSize, SamFileHeader &header)
Sets the SamRecord to contain the information in the BAM formatted fromBuffer.
Definition: SamRecord.cpp:525
int32_t get1BasedUnclippedStart()
Returns the 1-based inclusive left-most position adjusted for clipped bases.
Definition: SamRecord.cpp:1519
bool addTag(const char *tag, char vtype, const char *value)
Add the specified tag,vtype,value to the record.
Definition: SamRecord.cpp:791
uint16_t getBin()
Get the BAM bin for the record.
Definition: SamRecord.cpp:1347
bool isValid(SamFileHeader &header)
Returns whether or not the record is valid, setting the status to indicate success or failure.
Definition: SamRecord.cpp:161
int32_t getMateReferenceID()
Get the mate reference id of the record (BAM format: mate_rid/next_refID).
Definition: SamRecord.cpp:1438
bool getFields(bamRecordStruct &recStruct, String &readName, String &cigar, String &sequence, String &quality)
Returns the values of all fields except the tags.
Definition: SamRecord.cpp:1866
bool set0BasedMatePosition(int32_t matePosition)
Set the mate/next fragment's leftmost position using the specified 0-based (BAM format) value.
Definition: SamRecord.cpp:328
void resetRecord()
Reset the fields of the record to a default value.
Definition: SamRecord.cpp:91
bool setFlag(uint16_t flag)
Set the bitwise FLAG to the specified value.
Definition: SamRecord.cpp:215
bool set1BasedPosition(int32_t position)
Set the leftmost position (POS) using the specified 1-based (SAM format) value.
Definition: SamRecord.cpp:236
SamStatus::Status setBufferFromFile(IFILE filePtr, SamFileHeader &header)
Read the BAM record from a file.
Definition: SamRecord.cpp:558
uint16_t getFlag()
Get the flag (FLAG).
Definition: SamRecord.cpp:1384
const void * getRecordBuffer()
Get a const pointer to the buffer that contains the BAM representation of the record.
Definition: SamRecord.cpp:1204
void setSequenceTranslation(SequenceTranslation translation)
Set the type of sequence translation to use when getting the sequence.
Definition: SamRecord.cpp:187
int32_t get1BasedMatePosition()
Get the 1-based(SAM) leftmost mate/next fragment's position (PNEXT).
Definition: SamRecord.cpp:1445
int32_t get0BasedUnclippedEnd()
Returns the 0-based inclusive right-most position adjusted for clipped bases.
Definition: SamRecord.cpp:1526
bool shiftIndelsLeft()
Shift the indels (if any) to the left by updating the CIGAR.
Definition: SamRecord.cpp:368
int * getIntegerTag(const char *tag)
Get the integer value for the specified tag, DEPRECATED, use one that returns a bool (success/failure...
Definition: SamRecord.cpp:2216
const SamStatus & getStatus()
Returns the status associated with the last method that sets the status.
Definition: SamRecord.cpp:2403
static bool isCharType(char vtype)
Returns whether or not the specified vtype is a char type.
Definition: SamRecord.cpp:2062
bool setCigar(const char *cigar)
Set the CIGAR to the specified SAM formatted cigar string.
Definition: SamRecord.cpp:259
int32_t get1BasedUnclippedEnd()
Returns the 1-based inclusive right-most position adjusted for clipped bases.
Definition: SamRecord.cpp:1535
uint32_t getNumOverlaps(int32_t start, int32_t end)
Return the number of bases in this read that overlap the passed in region.
Definition: SamRecord.cpp:1853
const char * getMateReferenceName()
Get the mate/next fragment's reference sequence name (RNEXT).
Definition: SamRecord.cpp:1410
bool checkTag(const char *tag, char type)
Check if the specified tag contains a value of the specified vtype.
Definition: SamRecord.cpp:2381
bool getNextSamTag(char *tag, char &vtype, void **value)
Get the next tag from the record.
Definition: SamRecord.cpp:1962
void setReference(GenomeSequence *reference)
Set the reference to the specified genome sequence object.
Definition: SamRecord.cpp:178
bool setSequence(const char *seq)
Sets the sequence (SEQ) to the specified SAM formatted sequence string.
Definition: SamRecord.cpp:344
int32_t get0BasedUnclippedStart()
Returns the 0-based inclusive left-most position adjusted for clipped bases.
Definition: SamRecord.cpp:1506
int32_t getReadLength()
Get the length of the read.
Definition: SamRecord.cpp:1391
int32_t get0BasedAlignmentEnd()
Returns the 0-based inclusive rightmost position of the clipped sequence.
Definition: SamRecord.cpp:1467
const String * getStringTag(const char *tag)
Get the string value for the specified tag.
Definition: SamRecord.cpp:2180
bool set1BasedMatePosition(int32_t matePosition)
Set the mate/next fragment's leftmost position (PNEXT) using the specified 1-based (SAM format) value...
Definition: SamRecord.cpp:322
int32_t get0BasedPosition()
Get the 0-based(BAM) leftmost position of the record.
Definition: SamRecord.cpp:1319
const char * getCigar()
Returns the SAM formatted CIGAR string.
Definition: SamRecord.cpp:1555
uint8_t getMapQuality()
Get the mapping quality (MAPQ) of the record.
Definition: SamRecord.cpp:1340
const String & getString(const char *tag)
Get the string value for the specified tag.
Definition: SamRecord.cpp:2314
bool set0BasedPosition(int32_t position)
Set the leftmost position using the specified 0-based (BAM format) value.
Definition: SamRecord.cpp:242
const char * getReadName()
Returns the SAM formatted Read Name (QNAME).
Definition: SamRecord.cpp:1542
void resetTagIter()
Reset the tag iterator to the beginning of the tags.
Definition: SamRecord.cpp:2034
bool setQuality(const char *quality)
Sets the quality (QUAL) to the specified SAM formatted quality string.
Definition: SamRecord.cpp:357
bool setReferenceName(SamFileHeader &header, const char *referenceName)
Set the reference sequence name (RNAME) to the specified name, using the header to determine the refe...
Definition: SamRecord.cpp:223
const char * getQuality()
Returns the SAM formatted quality string (QUAL).
Definition: SamRecord.cpp:1638
~SamRecord()
Destructor.
Definition: SamRecord.cpp:72
const char * getSequence()
Returns the SAM formatted sequence string (SEQ), translating the base as specified by setSequenceTran...
Definition: SamRecord.cpp:1568
bool rmTags(const char *tags)
Remove tags.
Definition: SamRecord.cpp:1083
static bool isStringType(char vtype)
Returns whether or not the specified vtype is a string type.
Definition: SamRecord.cpp:2072
The SamValidationErrors class is a container class that holds SamValidationError Objects,...
void getErrorString(std::string &errorString) const
Append the error messages contained in this container to the passed in string.
static bool isValid(SamFileHeader &samHeader, SamRecord &samRecord, SamValidationErrors &validationErrors)
Validates whether or not the specified SamRecord is valid, calling all of the other validations.
This class is used to track the status results of some methods in the BAM classes.
Definition: StatGenStatus.h:27
Status
Return value enum for StatGenFile methods.
Definition: StatGenStatus.h:32
@ FAIL_MEM
fail a memory allocation.
Definition: StatGenStatus.h:45
@ NO_MORE_RECS
NO_MORE_RECS: failed to read a record since there are no more to read either in the file or section i...
Definition: StatGenStatus.h:36
@ SUCCESS
method completed successfully.
Definition: StatGenStatus.h:32
@ FAIL_IO
method failed due to an I/O issue.
Definition: StatGenStatus.h:37
@ INVALID
invalid other than for sorting.
Definition: StatGenStatus.h:44
@ FAIL_PARSE
failed to parse a record/header - invalid format.
Definition: StatGenStatus.h:42
@ FAIL_ORDER
FAIL_ORDER: method failed because it was called out of order, like trying to read a file without open...
Definition: StatGenStatus.h:41
void setStatus(Status newStatus, const char *newMessage)
Set the status with the specified status enum and message.
Status getStatus() const
Return the enum for this status object.
void addError(Status newStatus, const char *newMessage)
Add the specified error message to the status message, setting the status to newStatus if the current...
Structure of a BAM record.
Definition: SamRecord.h:34