libStatGen Software  1
SamInterface.cpp
1 /*
2  * Copyright (C) 2010 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include "SamInterface.h"
19 #include "SamRecordHelper.h"
20 
21 #include <limits>
22 #include <stdint.h>
23 
24 SamInterface::SamInterface()
25  : myFirstRecord("")
26 {
27 }
28 
29 
30 SamInterface::~SamInterface()
31 {
32 }
33 
34 
35 // Read a SAM file's header.
36 bool SamInterface::readHeader(IFILE filePtr, SamFileHeader& header,
37  SamStatus& status)
38 {
39  if(filePtr == NULL)
40  {
41  // File is not open.
43  "Cannot read header since the file pointer is null");
44  return(false);
45  }
46 
47  // Clear the passed in header.
48  header.resetHeader();
49 
50  int numValid = 0;
51  int numInvalid = 0;
52  std::string errorMessages = "";
53 
54  do {
55  StringIntHash tags;
56  StringArray values;
57  buffer.ReadLine(filePtr);
58 
59  // Stop reading header lines if at the end of the file or
60  // if the line is not blank and does not start with an @.
61  if ( ifeof(filePtr) ||
62  ((buffer.Length() != 0) && (buffer[0] != '@')) )
63  {
64  break;
65  }
66 
67  // This is a header line, so add it to header.
68  if(header.addHeaderLine(buffer.c_str()))
69  {
70  if(buffer.Length() != 0)
71  {
72  ++numValid;
73  }
74  }
75  else
76  {
77  ++numInvalid;
78  // Failed reading the header.
79  errorMessages += header.getErrorMessage();
80  // Skip further processing on this line since it was an error.
81  continue;
82  }
83  } while (1);
84 
85  // Store the first record since it was read.
86  myFirstRecord = buffer;
87 
88  if(numInvalid > 0)
89  {
90  if(numValid == 0)
91  {
92  std::cerr << "Failed to parse " << numInvalid << " header lines";
93  std::cerr << ". No valid header lines.\n";
94  status.setStatus(SamStatus::FAIL_PARSE, errorMessages.c_str());
95  return(false);
96  }
97  }
98 
99  // Successfully read.
100  return(true);
101 }
102 
103 bool SamInterface::writeHeader(IFILE filePtr, SamFileHeader& header,
104  SamStatus& status)
105 {
106  if((filePtr == NULL) || (filePtr->isOpen() == false))
107  {
108  // File is not open, return failure.
110  "Cannot write header since the file pointer is null");
111  return(false);
112  }
113 
114  ////////////////////////////////
115  // Write the header to the file.
116  ////////////////////////////////
117  // Construct a string containing the entire header.
118  std::string headerString = "";
119  header.getHeaderString(headerString);
120 
121  int32_t headerLen = headerString.length();
122  int numWrite = 0;
123 
124  // Write the header to the file.
125  numWrite = ifwrite(filePtr, headerString.c_str(), headerLen);
126  if(numWrite != headerLen)
127  {
129  "Failed to write the SAM header.");
130  return(false);
131  }
132  return(true);
133 }
134 
135 
136 void SamInterface::readRecord(IFILE filePtr, SamFileHeader& header,
137  SamRecord& record,
138  SamStatus& samStatus)
139 {
140  // Initialize the status to success - will be set to false on failure.
141  samStatus = SamStatus::SUCCESS;
142 
143  if((filePtr == NULL) || (filePtr->isOpen() == false))
144  {
145  // File is not open.
146  samStatus.addError(SamStatus::FAIL_ORDER,
147  "filePtr does not point to an open file.");
148  return;
149  }
150 
151  // If the first record has been set, use that and clear it,
152  // otherwise read the record from the file.
153  if(myFirstRecord.Length() != 0)
154  {
155  buffer = myFirstRecord;
156  myFirstRecord.Clear();
157  }
158  else
159  {
160  // Read the next record.
161  buffer.Clear();
162  buffer.ReadLine(filePtr);
163  // If the end of the file and nothing was read, return false.
164  if ((ifeof(filePtr)) && (buffer.Length() == 0))
165  {
166  // end of the file and nothing to process.
168  "No more records in the file.");
169  return;
170  }
171  }
172 
173  tokens.ReplaceColumns(buffer, '\t');
174 
175 
176  // Error string for reporting a parsing failure.
177  String errorString = "";
178 
179  if (tokens.Length() < 11)
180  {
181  errorString = "Too few columns (";
182  errorString += tokens.Length();
183  errorString += ") in the Record, expected at least 11.";
185  errorString.c_str());
186  return;
187  }
188 
189  // Reset the record before setting any fields.
190  record.resetRecord();
191 
192  if(!record.setReadName(tokens[0]))
193  {
194  samStatus.addError(record.getStatus());
195  }
196 
197  long flagInt = 0;
198  if(!tokens[1].AsInteger(flagInt))
199  {
200  errorString = "flag, ";
201  errorString += tokens[1].c_str();
202  errorString += ", is not an integer.";
204  errorString.c_str());
205  }
206  else if((flagInt < 0) || (flagInt > UINT16_MAX))
207  {
208  errorString = "flag, ";
209  errorString += tokens[1].c_str();
210  errorString += ", is not between 0 and (2^16)-1 = 65535.";
212  errorString.c_str());
213  }
214  else if(!record.setFlag(flagInt))
215  {
216  samStatus.addError(record.getStatus().getStatus(),
217  record.getStatus().getStatusMessage());
218  }
219 
220  if(!record.setReferenceName(header, tokens[2]))
221  {
222  samStatus.addError(record.getStatus().getStatus(),
223  record.getStatus().getStatusMessage());
224  }
225 
226  long posInt = 0;
227  if(!tokens[3].AsInteger(posInt))
228  {
229  errorString = "position, ";
230  errorString += tokens[3].c_str();
231  errorString += ", is not an integer.";
233  errorString.c_str());
234  }
235  else if((posInt < INT32_MIN) || (posInt > INT32_MAX))
236  {
237  // If it is not in this range, it cannot fit into a 32 bit int.
238  errorString = "position, ";
239  errorString += tokens[3].c_str();
240  errorString += ", does not fit in a 32 bit signed int.";
242  errorString.c_str());
243  }
244  else if(!record.set1BasedPosition(posInt))
245  {
246  samStatus.addError(record.getStatus().getStatus(),
247  record.getStatus().getStatusMessage());
248  }
249 
250  long mapInt = 0;
251  if(!tokens[4].AsInteger(mapInt))
252  {
253  errorString = "map quality, ";
254  errorString += tokens[4].c_str();
255  errorString += ", is not an integer.";
257  errorString.c_str());
258  }
259  else if((mapInt < 0) || (mapInt > UINT8_MAX))
260  {
261  errorString = "map quality, ";
262  errorString += tokens[4].c_str();
263  errorString += ", is not between 0 and (2^8)-1 = 255.";
265  errorString.c_str());
266  }
267  else if(!record.setMapQuality(mapInt))
268  {
269  samStatus.addError(record.getStatus().getStatus(),
270  record.getStatus().getStatusMessage());
271  }
272 
273  if(!record.setCigar(tokens[5]))
274  {
275  samStatus.addError(record.getStatus().getStatus(),
276  record.getStatus().getStatusMessage());
277  }
278 
279  if(!record.setMateReferenceName(header, tokens[6]))
280  {
281  samStatus.addError(record.getStatus().getStatus(),
282  record.getStatus().getStatusMessage());
283  }
284 
285  long matePosInt = 0;
286  if(!tokens[7].AsInteger(matePosInt))
287  {
288  errorString = "mate position, ";
289  errorString += tokens[7].c_str();
290  errorString += ", is not an integer.";
292  errorString.c_str());
293  }
294  else if(!record.set1BasedMatePosition(matePosInt))
295  {
296  samStatus.addError(record.getStatus().getStatus(),
297  record.getStatus().getStatusMessage());
298  }
299 
300  long insertInt = 0;
301  if(!tokens[8].AsInteger(insertInt))
302  {
303  errorString = "insert size, ";
304  errorString += tokens[8].c_str();
305  errorString += ", is not an integer.";
307  errorString.c_str());
308  }
309  else if(!record.setInsertSize(insertInt))
310  {
311  samStatus.addError(record.getStatus().getStatus(),
312  record.getStatus().getStatusMessage());
313  }
314 
315  if(!record.setSequence(tokens[9]))
316  {
317  samStatus.addError(record.getStatus().getStatus(),
318  record.getStatus().getStatusMessage());
319  }
320 
321  if(!record.setQuality(tokens[10]))
322  {
323  samStatus.addError(record.getStatus().getStatus(),
324  record.getStatus().getStatusMessage());
325  }
326 
327  // Clear the tag fields.
328  record.clearTags();
329 
330  // Add the tags to the record.
331  for (int i = 11; i < tokens.Length(); i++)
332  {
333  String & nugget = tokens[i];
334 
335  if (nugget.Length() < 6 || nugget[2] != ':' || nugget[4] != ':')
336  {
337  // invalid tag format.
338  errorString = "Invalid Tag Format: ";
339  errorString += nugget.c_str();
340  errorString += ", should be cc:c:x*.";
342  errorString.c_str());
343  continue;
344  }
345 
346  // Valid tag format.
347  // Add the tag.
348  if(!record.addTag((const char *)nugget, nugget[3],
349  (const char *)nugget + 5))
350  {
351  samStatus.addError(record.getStatus().getStatus(),
352  record.getStatus().getStatusMessage());
353  }
354  }
355 
356  return;
357 }
358 
359 
360 SamStatus::Status SamInterface::writeRecord(IFILE filePtr,
361  SamFileHeader& header,
362  SamRecord& record,
363  SamRecord::SequenceTranslation translation)
364 {
365  // Store all the fields into a string, then write the string.
366  String recordString = record.getReadName();
367  recordString += "\t";
368  recordString += record.getFlag();
369  recordString += "\t";
370  recordString += record.getReferenceName();
371  recordString += "\t";
372  recordString += record.get1BasedPosition();
373  recordString += "\t";
374  recordString += record.getMapQuality();
375  recordString += "\t";
376  recordString += record.getCigar();
377  recordString += "\t";
378  recordString += record.getMateReferenceNameOrEqual();
379  recordString += "\t";
380  recordString += record.get1BasedMatePosition();
381  recordString += "\t";
382  recordString += record.getInsertSize();
383  recordString += "\t";
384  recordString += record.getSequence(translation);
385  recordString += "\t";
386  recordString += record.getQuality();
387 
388  // If there are any tags, add a preceding tab.
389  if(record.getTagLength() != 0)
390  {
391  recordString += "\t";
392  SamRecordHelper::genSamTagsString(record, recordString);
393  }
394 
395  recordString += "\n";
396 
397 
398  // Write the record.
399  ifwrite(filePtr, recordString.c_str(), recordString.Length());
400  return(SamStatus::SUCCESS);
401 }
402 
403 
404 void SamInterface::ParseHeaderLine(StringIntHash & tags, StringArray & values)
405 {
406  tags.Clear();
407  values.Clear();
408 
409  tokens.AddColumns(buffer, '\t');
410 
411  for (int i = 1; i < tokens.Length(); i++)
412  {
413  tags.Add(tokens[i].Left(2), i - 1);
414  values.Push(tokens[i].SubStr(3));
415  }
416 }
417 
418 
419 bool SamInterface::isEOF(IFILE filePtr)
420 {
421 
422  if(myFirstRecord.Length() != 0)
423  {
424  // First record is set, so return false, not EOF, since we
425  // know the record still needs to be processed.
426  return(false);
427  }
428  return(GenericSamInterface::isEOF(filePtr));
429 }
int ifeof(IFILE file)
Check to see if we have reached the EOF (returns 0 if not EOF).
Definition: InputFile.h:654
unsigned int ifwrite(IFILE file, const void *buffer, unsigned int size)
Write the specified number of bytes from the specified buffer into the file.
Definition: InputFile.h:669
Class for easily reading/writing files without having to worry about file type (uncompressed,...
Definition: InputFile.h:37
bool isOpen() const
Returns whether or not the file was successfully opened.
Definition: InputFile.h:423
This class allows a user to get/set the fields in a SAM/BAM Header.
Definition: SamFileHeader.h:35
bool addHeaderLine(const char *type, const char *tag, const char *value)
Add a header line that is just one tag with a const char* value.
bool getHeaderString(std::string &header) const
Set the passed in string to the entire header string, clearing its current contents.
void resetHeader()
Initialize the header.
const char * getErrorMessage()
Get the failure message if a method returned failure.
static bool genSamTagsString(SamRecord &record, String &returnString, char delim='\t')
Helper to append the SAM string representation of all the tags to the specified string.
Class providing an easy to use interface to get/set/operate on the fields in a SAM/BAM record.
Definition: SamRecord.h:52
const char * getReferenceName()
Get the reference sequence name (RNAME) of the record.
Definition: SamRecord.cpp:1298
SequenceTranslation
Enum containing the settings on how to translate the sequence if a reference is available.
Definition: SamRecord.h:57
bool setReadName(const char *readName)
Set QNAME to the passed in name.
Definition: SamRecord.cpp:193
int32_t getInsertSize()
Get the inferred insert size of the read pair (ISIZE) or observed template length (TLEN).
Definition: SamRecord.cpp:1459
int32_t get1BasedPosition()
Get the 1-based(SAM) leftmost position (POS) of the record.
Definition: SamRecord.cpp:1312
void clearTags()
Clear the tags in this record.
Definition: SamRecord.cpp:977
bool setInsertSize(int32_t insertSize)
Sets the inferred insert size (ISIZE)/observed template length (TLEN).
Definition: SamRecord.cpp:336
uint32_t getTagLength()
Returns the length of the BAM formatted tags.
Definition: SamRecord.cpp:1929
bool setMateReferenceName(SamFileHeader &header, const char *mateReferenceName)
Set the mate/next fragment's reference sequence name (RNEXT) to the specified name,...
Definition: SamRecord.cpp:297
const char * getMateReferenceNameOrEqual()
Get the mate/next fragment's reference sequence name (RNEXT), returning "=" if it is the same as the ...
Definition: SamRecord.cpp:1420
bool setMapQuality(uint8_t mapQuality)
Set the mapping quality (MAPQ).
Definition: SamRecord.cpp:251
bool addTag(const char *tag, char vtype, const char *value)
Add the specified tag,vtype,value to the record.
Definition: SamRecord.cpp:791
void resetRecord()
Reset the fields of the record to a default value.
Definition: SamRecord.cpp:91
bool setFlag(uint16_t flag)
Set the bitwise FLAG to the specified value.
Definition: SamRecord.cpp:215
bool set1BasedPosition(int32_t position)
Set the leftmost position (POS) using the specified 1-based (SAM format) value.
Definition: SamRecord.cpp:236
uint16_t getFlag()
Get the flag (FLAG).
Definition: SamRecord.cpp:1384
int32_t get1BasedMatePosition()
Get the 1-based(SAM) leftmost mate/next fragment's position (PNEXT).
Definition: SamRecord.cpp:1445
const SamStatus & getStatus()
Returns the status associated with the last method that sets the status.
Definition: SamRecord.cpp:2403
bool setCigar(const char *cigar)
Set the CIGAR to the specified SAM formatted cigar string.
Definition: SamRecord.cpp:259
bool setSequence(const char *seq)
Sets the sequence (SEQ) to the specified SAM formatted sequence string.
Definition: SamRecord.cpp:344
bool set1BasedMatePosition(int32_t matePosition)
Set the mate/next fragment's leftmost position (PNEXT) using the specified 1-based (SAM format) value...
Definition: SamRecord.cpp:322
const char * getCigar()
Returns the SAM formatted CIGAR string.
Definition: SamRecord.cpp:1555
uint8_t getMapQuality()
Get the mapping quality (MAPQ) of the record.
Definition: SamRecord.cpp:1340
const char * getReadName()
Returns the SAM formatted Read Name (QNAME).
Definition: SamRecord.cpp:1542
bool setQuality(const char *quality)
Sets the quality (QUAL) to the specified SAM formatted quality string.
Definition: SamRecord.cpp:357
bool setReferenceName(SamFileHeader &header, const char *referenceName)
Set the reference sequence name (RNAME) to the specified name, using the header to determine the refe...
Definition: SamRecord.cpp:223
const char * getQuality()
Returns the SAM formatted quality string (QUAL).
Definition: SamRecord.cpp:1638
const char * getSequence()
Returns the SAM formatted sequence string (SEQ), translating the base as specified by setSequenceTran...
Definition: SamRecord.cpp:1568
This class is used to track the status results of some methods in the BAM classes.
Definition: StatGenStatus.h:27
const char * getStatusMessage() const
Return the status message for this object.
Status
Return value enum for StatGenFile methods.
Definition: StatGenStatus.h:32
@ NO_MORE_RECS
NO_MORE_RECS: failed to read a record since there are no more to read either in the file or section i...
Definition: StatGenStatus.h:36
@ SUCCESS
method completed successfully.
Definition: StatGenStatus.h:32
@ FAIL_IO
method failed due to an I/O issue.
Definition: StatGenStatus.h:37
@ FAIL_PARSE
failed to parse a record/header - invalid format.
Definition: StatGenStatus.h:42
@ FAIL_ORDER
FAIL_ORDER: method failed because it was called out of order, like trying to read a file without open...
Definition: StatGenStatus.h:41
void setStatus(Status newStatus, const char *newMessage)
Set the status with the specified status enum and message.
Status getStatus() const
Return the enum for this status object.
void addError(Status newStatus, const char *newMessage)
Add the specified error message to the status message, setting the status to newStatus if the current...