libStatGen Software  1
SamFileHeader.cpp
1 /*
2  * Copyright (C) 2010 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include "SamFileHeader.h"
19 #include "SamHeaderSQ.h"
20 #include "SamHeaderRG.h"
21 
22 
23 const std::string SamFileHeader::EMPTY_RETURN = "";
24 
25 SamFileHeader::SamFileHeader()
26  : myHD(NULL),
27  myReferenceInfo(),
28  myErrorMessage("")
29 {
30  resetHeader();
31 
32  mySQs.setCaseSensitive(true);
33  myRGs.setCaseSensitive(true);
34  myPGs.setCaseSensitive(true);
35 }
36 
37 
38 SamFileHeader::~SamFileHeader()
39 {
40  resetHeader();
41 }
42 
43 
44 // Copy Constructor
45 SamFileHeader::SamFileHeader(const SamFileHeader& header)
46 {
47  copy(header);
48 }
49 
50 
51 // Overload operator = to copy the passed in header into this header.
53 {
54  copy(header);
55  return(*this);
56 }
57 
58 
60 {
61  // Check to see if the passed in value is the same as this.
62  if(this == &header)
63  {
64  return(true);
65  }
66 
67  resetHeader();
68 
69  // Copy the records by getting the other header's header string
70  // and parsing it.
71  std::string newString;
72  bool status = header.getHeaderString(newString);
73  String newHeaderString = newString.c_str();
74 
75  status &= parseHeader(newHeaderString);
76 
77  myCurrentHeaderIndex = header.myCurrentHeaderIndex;
78  myCurrentCommentIndex = header.myCurrentCommentIndex;
79 
80  // Clear the reference info and copy it to ensure it is the same.
81  myReferenceInfo.clear();
82  // Copy Reference contigs, hash, lengths.
83  myReferenceInfo = header.myReferenceInfo;
84 
85  return(status);
86 }
87 
88 
89 // Reset the header for a new entry, clearing out previous values.
91 {
92  myReferenceInfo.clear();
93 
94  // Clear the pointers to the header records. They are deleted when the
95  // vector is cleaned up.
96  myHD = NULL;
97  mySQs.Clear();
98  myRGs.Clear();
99  myPGs.Clear();
100 
101  // Delete the header records and clear the vector.
102  for(unsigned int headerIndex = 0; headerIndex < myHeaderRecords.size();
103  headerIndex++)
104  {
105  if(myHeaderRecords[headerIndex] != NULL)
106  {
107  delete myHeaderRecords[headerIndex];
108  myHeaderRecords[headerIndex] = NULL;
109  }
110  }
111  myHeaderRecords.clear();
112 
113  // Reset the iterator for the header lines.
115 
116  // Reset the comment iterator.
118 
119  // Reset the individual type header iterators.
123 
124  // Clear the comments
125  myComments.clear();
126 }
127 
128 
129 // Set the passed in string to the entire header string. Clearing its
130 // current contents.
131 bool SamFileHeader::getHeaderString(std::string& header) const
132 {
133  header.clear();
134 
135  // Keep getting header lines until there are no more - false returned.
136  unsigned int index = 0;
137  while(getHeaderLine(index, header) != false)
138  {
139  ++index;
140  }
141 
142  return(true);
143 }
144 
145 
146 int SamFileHeader::getReferenceID(const String & referenceName, bool addID)
147 {
148  return(myReferenceInfo.getReferenceID(referenceName, addID));
149 }
150 
151 
152 int SamFileHeader::getReferenceID(const char* referenceName, bool addID)
153 {
154  return(myReferenceInfo.getReferenceID(referenceName, addID));
155 }
156 
157 
159 {
160  return(myReferenceInfo.getReferenceLabel(id));
161 }
162 
163 
164 // Get the Reference Information
166 {
167  return(myReferenceInfo);
168 }
169 
170 
171 // Get the Reference Information for updating separately when reading
172 // BAMs...should only be called by BamInterface.
173 SamReferenceInfo& SamFileHeader::getReferenceInfoForBamInterface()
174 {
175  return(myReferenceInfo);
176 }
177 
178 
179 // Add a header line that has an const char* value.
180 bool SamFileHeader::addHeaderLine(const char* type, const char* tag,
181  const char* value)
182 {
183  String headerLine;
184  headerLine += "@";
185  headerLine += type;
186  headerLine += "\t";
187  headerLine += tag;
188  headerLine += ":";
189  headerLine += value;
190  return(addHeaderLine(headerLine.c_str()));
191 }
192 
193 
194 // Add a header line that is already preformatted in a const char*.
195 bool SamFileHeader::addHeaderLine(const char* headerLine)
196 {
197  // Parse the added header line.
198  String headerString = headerLine;
199  return(parseHeader(headerString));
200 }
201 
202 
203 // Add a header line that is already preformatted in a const char*.
204 bool SamFileHeader::addHeader(const char* header)
205 {
206  // Parse the added header line.
207  String headerString = header;
208  return(parseHeader(headerString));
209 }
210 
211 
212 // Add a comment.
213 bool SamFileHeader::addComment(const char* comment)
214 {
215  if((comment != NULL) && (strcmp(comment, EMPTY_RETURN.c_str()) != 0))
216  {
217  // Valid comment, so add it.
218  myComments.push_back(comment);
219  }
220  return(true);
221 }
222 
223 
224 // Add the specified tag and value to the HD header.
225 bool SamFileHeader::setHDTag(const char* tag, const char* value)
226 {
227  if(myHD == NULL)
228  {
229  // Need to create the HD line.
230  myHD = new SamHeaderHD();
231  if(myHD == NULL)
232  {
233  // New failed, return false.
234  myErrorMessage = "SamFileHeader: Failed to allocate a new HD tag";
235  return(false);
236  }
237  // Succeeded to create the line, add it to the
238  // list.
239  myHeaderRecords.push_back(myHD);
240  }
241  if(!myHD->setTag(tag, value))
242  {
243  myErrorMessage = "SamFileHeader: Failed to set the specified HD tag";
244  return(false);
245  }
246  return(true);
247 }
248 
249 
250 // Add the specified tag and value to the SQ header with the specified name.
251 // If the header does not yet exist, the header is added.
252 bool SamFileHeader::setSQTag(const char* tag, const char* value,
253  const char* name)
254 {
255  // Get the SQ record for the specified name.
256  SamHeaderSQ* sq = getSQ(name);
257  if(sq == NULL)
258  {
259  // The SQ does not yet exist.
260  // Make sure the tag is LN.
261  if(strcmp(tag, "LN") != 0)
262  {
263  // LN is required so must be the first tag added
264  myErrorMessage =
265  "SamFileHeader:Failed to add the specified SQ key, LN not specified.";
266  return(false);
267  }
268 
269  // Add it.
270  sq = new SamHeaderSQ();
271 
272  if(sq == NULL)
273  {
274  // Could not create the header record.
275  myErrorMessage = "SamFileHeader: Failed to allocate a new SQ tag";
276  return(false);
277  }
278 
279  // Created the header record, so add it to the list of SQ lines.
280  mySQs.Add(name, sq);
281  myHeaderRecords.push_back(sq);
282  // value is the length, so update the reference info.
283  myReferenceInfo.add(name, atoi(value));
284 
285  // Add the key tag
286  if(!sq->addKey(name))
287  {
288  // Failed to add the key tag, return false.
289  myErrorMessage = "SamFileHeader:Failed to add the specified SQ key";
290  return(false);
291  }
292  }
293  else if(strcmp(tag, "LN") == 0)
294  {
295  // Cannot modify/remove the LN tag.
296  myErrorMessage = "SamFileHeader:Cannot modify/remove the SQ's LN tag";
297  return(false);
298  }
299 
300  if(!sq->setTag(tag, value))
301  {
302  myErrorMessage = "Failed to set the specified SQ tag";
303  return(false);
304  }
305  return(true);
306 }
307 
308 
309 // Add the specified tag and value to the RG header with the read group
310 // identifier. If the header does not yet exist, the header is added.
311 bool SamFileHeader::setRGTag(const char* tag, const char* value, const char* id)
312 {
313  // Get the RG record for the specified name.
314  SamHeaderRG* rg = getRG(id);
315  if(rg == NULL)
316  {
317  // The RG does not yet exist.
318  // Add it.
319  rg = new SamHeaderRG();
320 
321  if(rg == NULL)
322  {
323  // Could not create the header record.
324  myErrorMessage = "Failed to allocate a new RG tag";
325  return(false);
326  }
327 
328  // Created the header record, so add it to the list of RG lines.
329  myRGs.Add(id, rg);
330  myHeaderRecords.push_back(rg);
331 
332  // Add the key tag
333  if(!rg->addKey(id))
334  {
335  // Failed to add the key tag, return false.
336  myErrorMessage = "Failed to add the specified RG key";
337  return(false);
338  }
339  }
340 
341  if(!rg->setTag(tag, value))
342  {
343  myErrorMessage = "Failed to set the specified RG tag";
344  return(false);
345  }
346  return(true);
347 }
348 
349 
350 // Add the specified tag and value to the PG header with the specified id.
351 // If the header does not yet exist, the header is added.
352 // Add the specified tag and value to the PG header.
353 bool SamFileHeader::setPGTag(const char* tag, const char* value, const char* id)
354 {
355  // Get the PG record for the specified name.
356  SamHeaderPG* pg = getPG(id);
357  if(pg == NULL)
358  {
359  // The PG does not yet exist.
360  // Add it.
361  pg = new SamHeaderPG();
362 
363  if(pg == NULL)
364  {
365  // Could not create the header record.
366  myErrorMessage = "Failed to allocate a new PG tag";
367  return(false);
368  }
369 
370  // Created the header record, so add it to the list of PG lines.
371  myPGs.Add(id, pg);
372  myHeaderRecords.push_back(pg);
373 
374  // Add the key tag
375  if(!pg->addKey(id))
376  {
377  // Failed to add the key tag, return false.
378  myErrorMessage = "Failed to add the specified PG key";
379  return(false);
380  }
381  }
382 
383  if(!pg->setTag(tag, value))
384  {
385  myErrorMessage = "Failed to set the specified PG tag";
386  return(false);
387  }
388  return(true);
389 }
390 
391 
392 // Add the HD record to the header.
394 {
395  // If there is already an HD header or if null
396  // was passed in, return false.
397  if(myHD != NULL)
398  {
399  myErrorMessage = "Failed add an HD tag - there is already one";
400  return(false);
401  }
402  if(hd == NULL)
403  {
404  myErrorMessage = "Failed add an HD tag - no tag specified";
405  return(false);
406  }
407  myHD = hd;
408 
409  myHeaderRecords.push_back(myHD);
410  return(true);
411 }
412 
413 
414 // Add the SQ record to the header.
416 {
417  if(sq == NULL)
418  {
419  // null pointer passed in, can't add it.
420  myErrorMessage = "SAM/BAM Header line failed to allocate SQ.";
421  return(false);
422  }
423  const char* name = sq->getTagValue("SN");
424  const char* length = sq->getTagValue("LN");
425  if(strcmp(name, EMPTY_RETURN.c_str()) == 0)
426  {
427  // SN is not set, so can't add it.
428  myErrorMessage =
429  "SAM/BAM Header line failure: Skipping SQ line that is missing the SN field.";
430  return(false);
431  }
432  if(strcmp(length, EMPTY_RETURN.c_str()) == 0)
433  {
434  // LN is not set, so can't add it.
435  myErrorMessage =
436  "SAM/BAM Header line failure: Skipping SQ line that is missing the LN field.";
437  return(false);
438  }
439 
440  // Determine whether or not a record with this
441  // key is already in the hash.
442  if(mySQs.Find(name) < 0)
443  {
444  // It is not already in the hash so add it.
445  mySQs.Add(name, sq);
446  myHeaderRecords.push_back(sq);
447  myReferenceInfo.add(name, atoi(length));
448  return(true);
449  }
450 
451  // It is already in the hash, so cannot be added.
452  myErrorMessage = "SAM/BAM Header line failure: Skipping SQ line that has a repeated SN field.";
453  return(false);
454 }
455 
456 
457 // Add the RG record to the header.
459 {
460  if(rg == NULL)
461  {
462  // null pointer passed in, can't add it.
463  myErrorMessage = "SAM/BAM Header line failed to allocate RG.";
464  return(false);
465  }
466  const char* id = rg->getTagValue("ID");
467  if(strcmp(id, EMPTY_RETURN.c_str()) == 0)
468  {
469  // ID is not set, so can't add it.
470  myErrorMessage = "SAM/BAM Header line failure: Skipping RG line that is missing the ID field.";
471  return(false);
472  }
473 
474  // Determine whether or not a record with this
475  // key is already in the hash.
476  if(myRGs.Find(id) < 0)
477  {
478  // It is not already in the hash so
479  // add it.
480  myRGs.Add(id, rg);
481  myHeaderRecords.push_back(rg);
482  return(true);
483  }
484 
485  // It is already in the hash, so cannot be added.
486  myErrorMessage = "SAM/BAM Header line failure: Skipping RG line that has a repeated ID field.";
487  return(false);
488 }
489 
490 
491 // Add the PG record to the header.
493 {
494  // If a null pointer was passed in, return false.
495  if(pg == NULL)
496  {
497  myErrorMessage = "SAM/BAM Header line failed to allocate PG.";
498  return(false);
499  }
500  const char* id = pg->getTagValue("ID");
501  if(strcmp(id, EMPTY_RETURN.c_str()) == 0)
502  {
503  // ID is not set, so can't add the header record.
504  myErrorMessage = "SAM/BAM Header line failure: Skipping PG line that is missing the ID field.";
505  return(false);
506  }
507 
508  // Determine whether or not a record with this
509  // key is already in the hash.
510  if(myPGs.Find(id) < 0)
511  {
512  // It is not already in the hash so
513  // add it.
514  myPGs.Add(id, pg);
515  myHeaderRecords.push_back(pg);
516  return(true);
517  }
518 
519  // It is already in the hash, so cannot be added.
520  myErrorMessage = "SAM/BAM Header line failure: Skipping PG line that has a repeated ID field.";
521  return(false);
522 }
523 
524 
525 // Add a copy of the passed in header record to the header, dispatching on its type (HD, PG, RG, or SQ).
527 {
528  SamHeaderRecord* newRec = hdrRec.createCopy();
529  bool returnVal = true;
530  switch(newRec->getType())
531  {
532  case SamHeaderRecord::HD:
533  returnVal = addHD((SamHeaderHD*)newRec);
534  break;
535  case SamHeaderRecord::PG:
536  returnVal = addPG((SamHeaderPG*)newRec);
537  break;
538  case SamHeaderRecord::RG:
539  returnVal = addRG((SamHeaderRG*)newRec);
540  break;
541  case SamHeaderRecord::SQ:
542  returnVal = addSQ((SamHeaderSQ*)newRec);
543  break;
544  default:
545  myErrorMessage = "Failed to copy a header record, unknown type.";
546  returnVal = false;
547  break;
548  }
549  return(returnVal);
550 }
551 
552 
553 // Remove the HD record.
555 {
556  if(myHD != NULL)
557  {
558  // Reset the record. Do not delete it since it is in the headerRecords
559  // vector and it is not worth the time to remove it from the middle of
560  // that vector since this is the header and the space does not need
561  // to be conserved.
562  myHD->reset();
563 
564  // Set myHD to null so a new HD could be added.
565  myHD = NULL;
566  }
567 
568  return(true);
569 }
570 
571 
572 // Remove the SQ record associated with the specified name.
573 bool SamFileHeader::removeSQ(const char* name)
574 {
575  // Look up the name in the hash.
576  int hashIndex = mySQs.Find(name);
577  if(hashIndex < 0)
578  {
579  // Not found in the hash, so nothing to
580  // delete, return true it does not exist
581  // in the hash.
582  return(true);
583  }
584 
585  // Get the SQ.
586  SamHeaderSQ* sq = (SamHeaderSQ*)(mySQs.Object(hashIndex));
587 
588  if(sq == NULL)
589  {
590  // sq is null, this is an error since hashIndex was greater than 0,
591  // so it should have been found.
592  myErrorMessage = "SAM/BAM Header line failed to get SQ object.";
593  return(false);
594  }
595 
596  // Reset the record. Do not delete it since it is in the headerRecords
597  // vector and it is not worth the time to remove it from the middle of
598  // that vector since this is the header and the space does not need
599  // to be conserved.
600  sq->reset();
601 
602  // Delete the entry from the hash.
603  mySQs.Delete(hashIndex);
604 
605  return(true);
606 }
607 
608 
609 // Remove the RG record associated with the specified id.
610 bool SamFileHeader::removeRG(const char* id)
611 {
612  // Look up the id in the hash.
613  int hashIndex = myRGs.Find(id);
614  if(hashIndex < 0)
615  {
616  // Not found in the hash, so nothing to
617  // delete, return true it does not exist
618  // in the hash.
619  return(true);
620  }
621 
622  // Get the RG.
623  SamHeaderRG* rg = (SamHeaderRG*)(myRGs.Object(hashIndex));
624 
625  if(rg == NULL)
626  {
627  // rg is null, this is an error since hashIndex was greater than 0,
628  // so it should have been found.
629  myErrorMessage = "SAM/BAM Header line failed to get RG object.";
630  return(false);
631  }
632 
633  // Reset the record. Do not delete it since it is in the headerRecords
634  // vector and it is not worth the time to remove it from the middle of
635  // that vector since this is the header and the space does not need
636  // to be conserved.
637  rg->reset();
638 
639  // Delete the entry from the hash.
640  myRGs.Delete(hashIndex);
641 
642  return(true);
643 }
644 
645 
646 // Remove the PG record associated with the specified id.
647 bool SamFileHeader::removePG(const char* id)
648 {
649  // Look up the id in the hash.
650  int hashIndex = myPGs.Find(id);
651  if(hashIndex < 0)
652  {
653  // Not found in the hash, so nothing to
654  // delete, return true it does not exist
655  // in the hash.
656  return(true);
657  }
658 
659  // Get the PG.
660  SamHeaderPG* pg = (SamHeaderPG*)(myPGs.Object(hashIndex));
661 
662  if(pg == NULL)
663  {
664  // pg is null, this is an error since hashIndex was greater than 0,
665  // so it should have been found.
666  myErrorMessage = "SAM/BAM Header line failed to get PG object.";
667  return(false);
668  }
669 
670  // Reset the record. Do not delete it since it is in the headerRecords
671  // vector and it is not worth the time to remove it from the middle of
672  // that vector since this is the header and the space does not need
673  // to be conserved.
674  pg->reset();
675 
676  // Delete the entry from the hash.
677  myPGs.Delete(hashIndex);
678 
679  return(true);
680 }
681 
682 
683 const char* SamFileHeader::getHDTagValue(const char* tag)
684 {
685  if(myHD == NULL)
686  {
687  // return blank since there is no HD type.
688  return(EMPTY_RETURN.c_str());
689  }
690  return(myHD->getTagValue(tag));
691 }
692 
693 
694 // Get the value associated with the specified tag on the SQ line with
695 // the specified sequence name.
696 const char* SamFileHeader::getSQTagValue(const char* tag, const char* name)
697 {
698  // Look up the name in the hash to get the associated SQ object.
699  SamHeaderSQ* sq = (SamHeaderSQ*)(mySQs.Object(name));
700 
701  // If it is NULL - the tag was not found, so return
702  if(sq == NULL)
703  {
704  return(EMPTY_RETURN.c_str());
705  }
706 
707  // Found the object, so return the SQ Tag.
708  return(sq->getTagValue(tag));
709 }
710 
711 
712 // Get the value associated with the specified tag on the RG line with
713 // the specified read group identifier.
714 const char* SamFileHeader::getRGTagValue(const char* tag, const char* id)
715 {
716  // Look up the id in the hash to get the associated RG object.
717  SamHeaderRG* rg = (SamHeaderRG*)(myRGs.Object(id));
718 
719  // If it is NULL - the tag was not found, so return
720  if(rg == NULL)
721  {
722  return(EMPTY_RETURN.c_str());
723  }
724 
725  // Found the object, so return the RG Tag.
726  return(rg->getTagValue(tag));
727 }
728 
729 
730 const char* SamFileHeader::getPGTagValue(const char* tag, const char* id)
731 {
732  // Look up the id in the hash to get the associated PG object.
733  SamHeaderPG* pg = (SamHeaderPG*)(myPGs.Object(id));
734 
735  // If it is NULL - the tag was not found, so return
736  if(pg == NULL)
737  {
738  return(EMPTY_RETURN.c_str());
739  }
740 
741  // Found the object, so return the PG Tag.
742  return(pg->getTagValue(tag));
743 }
744 
745 
746 // Get the number of SQ objects.
748 {
749  return(mySQs.Entries());
750 }
751 
752 
753 // Get the number of RG objects.
755 {
756  return(myRGs.Entries());
757 }
758 
759 
760 // Get the number of PG objects.
762 {
763  return(myPGs.Entries());
764 }
765 
766 
767 // Get the HD object.
769 {
770  return(myHD);
771 }
772 
773 
774 // Get the SQ object with the specified sequence name.
776 {
777  return((SamHeaderSQ*)(mySQs.Object(name)));
778 }
779 
780 
781 // Get the RG object with the specified read group identifier.
783 {
784  return((SamHeaderRG*)(myRGs.Object(id)));
785 }
786 
787 
788 // Get the PG object.
790 {
791  return((SamHeaderPG*)(myPGs.Object(id)));
792 }
793 
794 
795 // Return the value of the SO tag.
796 // If this field does not exist, EMPTY_RETURN.c_str() is returned.
798 {
799  if(myHD == NULL)
800  {
801  // No HD, so return blank EMPTY_RETURN.c_str()
802  return(EMPTY_RETURN.c_str());
803  }
804  return(myHD->getSortOrder());
805 }
806 
807 
808 // Deprecated way of getting the sort order from the file.
810 {
811  return(getSortOrder());
812 }
813 
814 
815 // Get the next SQ header record. After all SQ headers have been retrieved,
816 // NULL is returned until a reset is called.
818 {
819  return(getNextHeaderRecord(myCurrentSQIndex,
821 }
822 
823 
824 // Get the next RG header record. After all RG headers have been retrieved,
825 // NULL is returned until a reset is called.
827 {
828  return(getNextHeaderRecord(myCurrentRGIndex,
830 }
831 
832 
833 // Get the next PG header record. After all PG headers have been retrieved,
834 // NULL is returned until a reset is called.
836 {
837  return(getNextHeaderRecord(myCurrentPGIndex,
839 }
840 
841 
842 // Reset to the beginning of the header records so the next call
843 // to getNextSQRecord returns the first SQ header record.
845 {
846  myCurrentSQIndex = 0;
847 }
848 
849 
850 // Reset to the beginning of the header records so the next call
851 // to getNextRGRecord returns the first RG header record.
853 {
854  myCurrentRGIndex = 0;
855 }
856 
857 
858 // Reset to the beginning of the header records so the next call
859 // to getNextPGRecord returns the first PG header record.
861 {
862  myCurrentPGIndex = 0;
863 }
864 
865 
866 // Get the next header record of the specified type.
867 // Pass in the index to start looking at and the type to look for.
868 // Update the index.
869 // After all headers of that type have been retrieved,
870 // NULL is returned until a reset is called for that type.
873 {
874  SamHeaderRecord* foundRecord = NULL;
875  // Loop until a record is found or until out of range of the
876  // headerRecord vector.
877  while((index < myHeaderRecords.size())
878  && (foundRecord == NULL))
879  {
880  // Get the next record.
881  foundRecord = myHeaderRecords[index];
882  // Either way, increment the index.
883  ++index;
884  // Check to see if the next record is active.
885  if(!foundRecord->isActiveHeaderRecord())
886  {
887  // Not active, so clear the pointer.
888  foundRecord = NULL;
889  }
890  // Check to see if the record is the right type.
891  else if(foundRecord->getType() != headerType)
892  {
893  // Not the right type, so clear the pointer.
894  foundRecord = NULL;
895  }
896  }
897 
898  // Return the record if it was found. Will be null if none were found.
899  return(foundRecord);
900 }
901 
902 
903 // Get the next header record. After all headers have been retrieved,
904 // NULL is returned until a reset is called. Does not return the
905 // Comment lines.
906 // NOTE: both getNextHeaderRecord and getNextHeaderLine increment the
907 // same iterator.
909 {
910  // Get the next header record
911  SamHeaderRecord* foundRecord = NULL;
912  // Loop until a record is found or until out of range of the
913  // headerRecord vector.
914  while((myCurrentHeaderIndex < myHeaderRecords.size())
915  && (foundRecord == NULL))
916  {
917  // Get the next record.
918  foundRecord = myHeaderRecords[myCurrentHeaderIndex];
919  // Either way, increment the index.
920  ++myCurrentHeaderIndex;
921  // Check to see if the next record is active.
922  if(!foundRecord->isActiveHeaderRecord())
923  {
924  // Not active, so clear the pointer.
925  foundRecord = NULL;
926  }
927  }
928 
929  // Return the record if it was found. Will be null if none were found.
930  return(foundRecord);
931 }
932 
933 
934 // Set the passed in string to the next header line. The passed in
935 // string will be overwritten. If there are no more header lines or there
936 // is an error, false is returned and the passed in string is set to EMPTY_RETURN.c_str()
937 // until a rest is called.
938 // Will also return the comment lines.
939 // NOTE: both getNextHeaderRecord and getNextHeaderLine increment the
940 // same iterator.
941 bool SamFileHeader::getNextHeaderLine(std::string &headerLine)
942 {
943  headerLine = EMPTY_RETURN.c_str();
944 
945  // Until the header is set, keep reading.
946  // Header could return EMPTY_RETURN.c_str() if the header line is blank.
947  while(headerLine == EMPTY_RETURN.c_str())
948  {
949  if(getHeaderLine(myCurrentHeaderIndex, headerLine) == false)
950  {
951  // getHeaderLine failed, so stop processing, and return false.
952  return(false);
953  }
954  else
955  {
956  // In range, increment the index.
957  ++myCurrentHeaderIndex;
958  }
959  }
960  return(true);
961 }
962 
963 
964 // Reset to the beginning of the header records so the next call
965 // to getNextHeaderRecord returns the first header line.
967 {
968  myCurrentHeaderIndex = 0;
969 }
970 
971 
972 void SamFileHeader::appendCommentLines(std::string &commentLines)
973 {
974  for(unsigned int i = 0; i < myComments.size(); i++)
975  {
976  commentLines += "@CO\t";;
977  commentLines += myComments[i];
978  commentLines += "\n";
979  }
980 }
981 
982 
983 // Returns the comment on the next comment line. Returns EMPTY_RETURN.c_str() if all comment
984 // lines have been returned, until resetCommentIter is called.
986 {
987  if(myCurrentCommentIndex < myComments.size())
988  {
989  return(myComments[myCurrentCommentIndex++].c_str());
990  }
991  // Already gone through all the comments, return EMPTY_RETURN.c_str().
992  return(EMPTY_RETURN.c_str());
993 }
994 
995 
996 // Resets to the beginning of the comments so getNextComment returns
997 // the first comment.
999 {
1000  myCurrentCommentIndex = 0;
1001 }
1002 
1003 
1004 // Parse the header.
1005 bool SamFileHeader::parseHeader(String& header)
1006 {
1007  std::string errorMessage = "";
1008  int numErrors = 0;
1009  int numValid = 0;
1010 
1011  // Split the header into lines.
1012  std::vector<String>* types = header.Split('\n');
1013 
1014  // Loop through each header line, parsing that line.
1015  for(uint32_t index = 0; index < types->size(); index++)
1016  {
1017  // Parse the header line.
1018  if(!parseHeaderLine(types->at(index)))
1019  {
1020  errorMessage += myErrorMessage;
1021  errorMessage += "\n";
1022  ++numErrors;
1023  }
1024  else
1025  {
1026  // valid header line
1027  ++numValid;
1028  }
1029  }
1030 
1031  // Delete the types vector.
1032  delete types;
1033  types = NULL;
1034 
1035  myErrorMessage = errorMessage;
1036  if((numErrors > 0) && (numValid == 0))
1037  {
1038  // Only errors.
1039  std::cerr << numErrors
1040  << " invalid SAM/BAM Header lines were skipped due to:\n"
1041  << errorMessage << std::endl;
1042  return(false);
1043  }
1044  else if(numErrors > 0)
1045  {
1046  // Some valid & some invalid.
1047  // Going to return true, but add note about the invalid lines.
1048  std::cerr << numErrors
1049  << " invalid SAM/BAM Header lines were skipped due to:\n"
1050  << errorMessage << std::endl;
1051  }
1052 
1053  return(true);
1054 }
1055 
1056 
1057 // Parse one line of the header.
1058 bool SamFileHeader::parseHeaderLine(const String& headerLine)
1059 {
1060  // Check if the line starts with @CO.
1061  if((headerLine.Length() >= 4) && (headerLine[0] == '@') &&
1062  (headerLine[1] == 'C') && (headerLine[2] == 'O') &&
1063  (headerLine[3] == '\t'))
1064  {
1065  // Comment line.
1066  String comment = headerLine.SubStr(4);
1067  return(addComment(comment));
1068  }
1069 
1070  StringArray tokens;
1071 
1072  // Split the line by tabs.
1073  tokens.ReplaceColumns(headerLine, '\t');
1074 
1075  if(tokens.Length() < 1)
1076  {
1077  // Nothing on this line, just return true.
1078  return(true);
1079  }
1080 
1081  // Get the header type, the first column.
1082  if((tokens[0].Length() != 3) || (tokens[0][0] != '@'))
1083  {
1084  // The header type string is incorrect. Should be 3 characters
1085  // with the first one @.
1086  myErrorMessage = "SAM/BAM Header line does not start with @ & at least 2 chars.";
1087  return(false);
1088  }
1089 
1090  bool status = true;
1091  if(tokens[0] == "@HD")
1092  {
1093  if(myHD == NULL)
1094  {
1095  // Create a new hd.
1096  myHD = new SamHeaderHD();
1097  if(myHD == NULL)
1098  {
1099  // Failed to allocate HD, so return false.
1100  myErrorMessage = "SAM/BAM Header line failed to allocate HD.";
1101  return(false);
1102  }
1103  myHeaderRecords.push_back(myHD);
1104  if(!myHD->setFields(tokens))
1105  {
1106  myErrorMessage = "SAM/BAM Header line failed to store HD record.";
1107  status = false;
1108  }
1109  }
1110  else
1111  {
1112  // HD already set, so return false.
1113  myErrorMessage = "SAM/BAM Header line failure: multiple HD records.";
1114  status = false;
1115  }
1116  }
1117  else if(tokens[0] == "@SQ")
1118  {
1119  // Create a new SQ record.
1120  SamHeaderSQ* sq = new SamHeaderSQ();
1121 
1122  if(sq->setFields(tokens))
1123  {
1124  // sq fields were properly set, so add it to the list of
1125  // SQ lines.
1126  // myStatus set in the method.
1127  status &= addSQ(sq);
1128  }
1129  else
1130  {
1131  myErrorMessage = "SAM/BAM Header line failed to store SQ record.";
1132  status = false;
1133  }
1134  }
1135  else if(tokens[0] == "@RG")
1136  {
1137  // Create a new RG record.
1138  SamHeaderRG* rg = new SamHeaderRG();
1139 
1140  if(rg->setFields(tokens))
1141  {
1142  // rg fields were properly set, so add it to the list of
1143  // RG lines.
1144  // myStatus set in the method.
1145  status &= addRG(rg);
1146  }
1147  else
1148  {
1149  myErrorMessage = "SAM/BAM Header line failed to store RG record.";
1150  status = false;
1151  }
1152  }
1153  else if(tokens[0] == "@PG")
1154  {
1155  // Create a new PG record.
1156  SamHeaderPG* pg = new SamHeaderPG();
1157 
1158  if(pg->setFields(tokens))
1159  {
1160  // pg fields were properly set, so add it to the list of
1161  // PG lines.
1162  // myStatus set in the method.
1163  status &= addPG(pg);
1164  }
1165  else
1166  {
1167  myErrorMessage = "SAM/BAM Header line failed to store PG record.";
1168  status = false;
1169  }
1170  }
1171  else
1172  {
1173  // Unknown header type.
1174  myErrorMessage =
1175  "SAM/BAM Header line failure: Skipping unknown header type, ";
1176  myErrorMessage += (const char*)(tokens[0]);
1177  status = false;
1178  }
1179  return(status);
1180 }
1181 
1182 
1183 
1184 // Set the passed in string to the header line at the specified index.
1185 // It does NOT clear the current contents of header.
1186 // NOTE: some indexes will return blank if the entry was deleted.
1187 bool SamFileHeader::getHeaderLine(unsigned int index, std::string& header) const
1188 {
1189  // Check to see if the index is in range of the header records vector.
1190  if(index < myHeaderRecords.size())
1191  {
1192  // In range of the header records vector, so get the string for
1193  // that record.
1194  SamHeaderRecord* hdrRec = myHeaderRecords[index];
1195  hdrRec->appendString(header);
1196  return(true);
1197  }
1198  else
1199  {
1200  unsigned int commentIndex = index - myHeaderRecords.size();
1201  // Check to see if it is in range of the comments.
1202  if(commentIndex < myComments.size())
1203  {
1204  // It is in range of the comments, so add the type.
1205  header += "@CO\t";
1206  // Add the comment.
1207  header += myComments[commentIndex];
1208  // Add the new line.
1209  header += "\n";
1210  return(true);
1211  }
1212  }
1213  // Invalid index.
1214  return(false);
1215 }
This class allows a user to get/set the fields in a SAM/BAM Header.
Definition: SamFileHeader.h:35
SamHeaderPG * getPG(const char *id)
Get the PG object with the specified id, returning NULL if there is no PG object with that key.
bool addPG(SamHeaderPG *pg)
Add the PG record to the header.
const char * getSortOrder()
Return the Sort Order value that is set in the Header, returning "" if this field does not exist.
const char * getSQTagValue(const char *tag, const char *name)
Get the value associated with the specified tag on the SQ line with the specified sequence name,...
SamHeaderSQ * getSQ(const char *name)
Get the SQ object with the specified sequence name, returning NULL if there is no SQ object with that...
SamHeaderHD * getHD()
Get the HD object, returning NULL if there is no HD record.
void resetRGRecordIter()
Reset to the beginning of the header records so the next call to getNextRGRecord returns the first RG...
bool setPGTag(const char *tag, const char *value, const char *id)
Set the specified tag to the specified value in the PG header with the specified id,...
const char * getHDTagValue(const char *tag)
Returns the value associated with the specified HD tag, returning "" if the tag does not exist in the...
bool addRG(SamHeaderRG *rg)
Add the RG record to the header.
bool getNextHeaderLine(std::string &headerLine)
Set the passed in string to the next header line, overwriting the passed in string.
SamHeaderRecord * getNextPGRecord()
Get the next PG header record.
int getReferenceID(const String &referenceName, bool addID=false)
Get the reference ID for the specified reference name (chromosome).
bool removePG(const char *id)
Remove PG record with the specified key.
bool addSQ(SamHeaderSQ *sq)
Add the SQ record to the header.
int getNumSQs()
Get the number of SQ objects.
bool removeRG(const char *id)
Remove RG record with the specified key.
bool setSQTag(const char *tag, const char *value, const char *name)
Set the specified tag to the specified value in the SQ header with the specified name,...
bool addComment(const char *comment)
Add the specified comment to the header (do not include "@CO" or "\n").
int getNumRGs()
Get the number of RG objects.
const char * getTagSO()
DEPRECATED.
SamHeaderRecord * getNextHeaderRecord()
Get the next header record, but not comment line.
bool addHD(SamHeaderHD *hd)
Add the HD record to the header.
const char * getNextComment()
Returns the comment on the next comment line.
const char * getRGTagValue(const char *tag, const char *id)
Get the value associated with the specified tag on the RG line with the specified read group identifi...
const String & getReferenceLabel(int id) const
Return the reference name (chromosome) for the specified reference id.
bool addHeaderLine(const char *type, const char *tag, const char *value)
Add a header line that is just one tag with a const char* value.
const SamReferenceInfo & getReferenceInfo() const
Get the Reference Information.
bool removeHD()
Remove the HD record.
void resetSQRecordIter()
Reset to the beginning of the header records so the next call to getNextSQRecord returns the first SQ...
bool setRGTag(const char *tag, const char *value, const char *id)
Set the specified tag to the specified value in the RG header with the specified id,...
bool getHeaderString(std::string &header) const
Set the passed in string to the entire header string, clearing its current contents.
bool copy(const SamFileHeader &header)
Copy method copies the passed in header into this header.
void appendCommentLines(std::string &commentLines)
Append all of the comment lines to the specified string.
void resetHeaderRecordIter()
Reset to the beginning of the header records so the next call to getNextHeaderRecord returns the firs...
int getNumPGs()
Get the number of PG objects.
SamHeaderRecord * getNextRGRecord()
Get the next RG header record.
SamHeaderRecord * getNextSQRecord()
Get the next SQ header record.
void resetHeader()
Initialize the header.
bool removeSQ(const char *name)
Remove SQ record with the specified key.
const char * getPGTagValue(const char *tag, const char *id)
Get the value associated with the specified tag on the PG line with the specified id,...
bool addHeader(const char *header)
Add a header that is already preformatted in a const char*.
void resetPGRecordIter()
Reset to the beginning of the header records so the next call to getNextPGRecord returns the first PG...
SamHeaderRG * getRG(const char *id)
Get the RG object with the specified read group identifier, returning NULL if there is no RG object w...
bool addRecordCopy(const SamHeaderRecord &hdrRec)
Add a copy of the specified header record to the header.
bool setHDTag(const char *tag, const char *value)
Set the specified tag to the specified value in the HD header, remove the tag by specifying value="".
SamFileHeader & operator=(const SamFileHeader &header)
Overload operator = to copy the passed in header into this header.
void resetCommentIter()
Resets to the beginning of the comments so getNextComment returns the first comment.
This class encapsulates the tag value pairs contained within a SAM Header line with accessors for getti...
const char * getTagValue(const char *tag) const
Return the value associated with the specified tag.
void reset()
Reset this header record to an empty state with no tags.
SamHeaderRecordType getType()
Return the type of this header record (HD, SQ, RG, or PG) as an enum.
bool setFields(const StringArray &tokens)
Set the fields from the passed in line.
bool isActiveHeaderRecord()
This record is active (true) if there is at least one tag set.
bool appendString(std::string &header)
Appends the string representation of this header record to the passed in string.
virtual SamHeaderRecord * createCopy() const =0
Return a pointer to a newly created header record of the appropriate type that is a copy of this reco...
bool addKey(const char *value)
Add the key tag with the specified value (not for HD headers).
SamHeaderRecordType
Specifies the Type for the sam header record (line).
@ SQ
Sequence Dictionary.
@ RG
Read Group.
bool setTag(const char *tag, const char *value)
Set the value of the specified tag to the specified value, deletes the tag when value is NULL.
Class for tracking the reference information mapping between the reference ids and the reference name...
void clear()
Reset this reference info.
int getReferenceID(const String &referenceName, bool addID=false)
Get the reference ID for the specified name, if addID is set to true, a reference id will be created ...
const String & getReferenceLabel(int id) const
Get the reference name for the specified id, if the id is not found, return "*".
void add(const char *referenceSequenceName, int32_t referenceSequenceLength)
Add reference sequence name and reference sequence length.