libStatGen Software  1
Tabix.cpp
1 /*
2  * Copyright (C) 2012-2013 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 #include "Tabix.h"
18 #include <stdexcept>
19 #include "StringBasics.h"
20 
21 Tabix::Tabix()
22  : IndexBase(),
23  myChromNamesBuffer(NULL)
24 {
25 }
26 
27 
28 Tabix::~Tabix()
29 {
30  if(myChromNamesBuffer != NULL)
31  {
32  delete[] myChromNamesBuffer;
33  myChromNamesBuffer = NULL;
34  }
35 }
36 
37 
38 // Reset the member data for a new index file.
40 {
42  if(myChromNamesBuffer != NULL)
43  {
44  delete[] myChromNamesBuffer;
45  myChromNamesBuffer = NULL;
46  }
47  myChromNamesVector.clear();
48 }
49 
50 
51 // Read & parse the specified index file.
53 {
54  // Reset the index from anything that may previously be set.
55  resetIndex();
56 
57  IFILE indexFile = ifopen(filename, "rb");
58 
59  // Failed to open the index file.
60  if(indexFile == NULL)
61  {
62  return(StatGenStatus::FAIL_IO);
63  }
64 
65  // read the tabix index structure.
66 
67  // Read the magic string.
68  char magic[4];
69  if(ifread(indexFile, magic, 4) != 4)
70  {
71  // Failed to read the magic
72  return(StatGenStatus::FAIL_IO);
73  }
74 
75  // If this is not an index file, set num references to 0.
76  if (magic[0] != 'T' || magic[1] != 'B' || magic[2] != 'I' || magic[3] != 1)
77  {
78  // Not a Tabix Index file.
80  }
81 
82  // It is a tabix index file.
83  // Read the number of reference sequences.
84  if(ifread(indexFile, &n_ref, 4) != 4)
85  {
86  // Failed to read.
87  return(StatGenStatus::FAIL_IO);
88  }
89 
90  // Size the references.
91  myRefs.resize(n_ref);
92 
93  // Read the Format configuration.
94  if(ifread(indexFile, &myFormat, sizeof(myFormat)) != sizeof(myFormat))
95  {
96  // Failed to read.
97  return(StatGenStatus::FAIL_IO);
98  }
99 
100  // Read the length of the chromosome names.
101  uint32_t l_nm;
102 
103  if(ifread(indexFile, &l_nm, sizeof(l_nm)) != sizeof(l_nm))
104  {
105  // Failed to read.
106  return(StatGenStatus::FAIL_IO);
107  }
108 
109  // Read the chromosome names.
110  myChromNamesBuffer = new char[l_nm];
111  if(ifread(indexFile, myChromNamesBuffer, l_nm) != l_nm)
112  {
113  return(StatGenStatus::FAIL_IO);
114  }
115  myChromNamesVector.resize(n_ref);
116 
117  // Parse out the chromosome names.
118  bool prevNull = true;
119  int chromIndex = 0;
120  for(uint32_t i = 0; i < l_nm; i++)
121  {
122  if(chromIndex >= n_ref)
123  {
124  // already set the pointer for the last chromosome name,
125  // so stop looping.
126  break;
127  }
128  if(prevNull == true)
129  {
130  myChromNamesVector[chromIndex++] = myChromNamesBuffer + i;
131  prevNull = false;
132  }
133  if(myChromNamesBuffer[i] == '\0')
134  {
135  prevNull = true;
136  }
137  }
138 
139  for(int refIndex = 0; refIndex < n_ref; refIndex++)
140  {
141  // Read each reference.
142  Reference* ref = &(myRefs[refIndex]);
143 
144  // Read the number of bins.
145  if(ifread(indexFile, &(ref->n_bin), 4) != 4)
146  {
147  // Failed to read the number of bins.
148  // Return failure.
150  }
151 
152  // Resize the bins.
153  ref->bins.resize(ref->n_bin + 1);
154 
155  // Read each bin.
156  for(int binIndex = 0; binIndex < ref->n_bin; binIndex++)
157  {
158  uint32_t binNumber;
159 
160  // Read in the bin number.
161  if(ifread(indexFile, &(binNumber), 4) != 4)
162  {
163  // Failed to read the bin number.
164  // Return failure.
165  return(StatGenStatus::FAIL_IO);
166  }
167 
168  // Add the bin to the reference and get the
169  // pointer back so the values can be set in it.
170  Bin* binPtr = &(ref->bins[binIndex]);
171  binPtr->bin = binNumber;
172 
173  // Read in the number of chunks.
174  if(ifread(indexFile, &(binPtr->n_chunk), 4) != 4)
175  {
176  // Failed to read number of chunks.
177  // Return failure.
178  return(StatGenStatus::FAIL_IO);
179  }
180 
181  // Read in the chunks.
182  // Allocate space for the chunks.
183  uint32_t sizeOfChunkList = binPtr->n_chunk * sizeof(Chunk);
184  binPtr->chunks = (Chunk*)malloc(sizeOfChunkList);
185  if(ifread(indexFile, binPtr->chunks, sizeOfChunkList) != sizeOfChunkList)
186  {
187  // Failed to read the chunks.
188  // Return failure.
189  return(StatGenStatus::FAIL_IO);
190  }
191  }
192 
193  // Read the number of intervals.
194  if(ifread(indexFile, &(ref->n_intv), 4) != 4)
195  {
196  // Failed to read, set to 0.
197  ref->n_intv = 0;
198  // Return failure.
199  return(StatGenStatus::FAIL_IO);
200  }
201 
202  // Allocate space for the intervals and read them.
203  uint32_t linearIndexSize = ref->n_intv * sizeof(uint64_t);
204  ref->ioffsets = (uint64_t*)malloc(linearIndexSize);
205  if(ifread(indexFile, ref->ioffsets, linearIndexSize) != linearIndexSize)
206  {
207  // Failed to read the linear index.
208  // Return failure.
209  return(StatGenStatus::FAIL_IO);
210  }
211  }
212 
213  // Successfully read teh bam index file.
214  return(StatGenStatus::SUCCESS);
215 }
216 
217 
218 bool Tabix::getStartPos(const char* refName, int32_t start,
219  uint64_t& fileStartPos) const
220 {
221  // Look for the reference name in the list.
222  int refID = 0;
223  for(refID = 0; refID < n_ref; refID++)
224  {
225  if(strcmp(refName, myChromNamesVector[refID]) == 0)
226  {
227  // found the reference
228  break;
229  }
230  }
231  if(refID >= n_ref)
232  {
233  // Didn't find the refName, so return false.
234  return(false);
235  }
236 
237  // Look up in the linear index.
238  if(start < 0)
239  {
240  // Negative index, so start at 0.
241  start = 0;
242  }
243  return(getMinOffsetFromLinearIndex(refID, start, fileStartPos));
244 }
245 
246 
247 const char* Tabix::getRefName(unsigned int indexNum) const
248 {
249  if(indexNum >= myChromNamesVector.size())
250  {
251  String message = "ERROR: Out of range on Tabix::getRefName(";
252  message += indexNum;
253  message += ")";
254  throw(std::runtime_error(message.c_str()));
255  return(NULL);
256  }
257  return(myChromNamesVector[indexNum]);
258 }
unsigned int ifread(IFILE file, void *buffer, unsigned int size)
Read up to size bytes from the file into the buffer.
Definition: InputFile.h:600
method failed due to an I/O issue.
Definition: StatGenStatus.h:37
method completed successfully.
Definition: StatGenStatus.h:32
failed to parse a record/header - invalid format.
Definition: StatGenStatus.h:42
bool getStartPos(const char *refName, int32_t start, uint64_t &fileStartPos) const
Get the starting file offset to look for the specified start position.
Definition: Tabix.cpp:218
Class for easily reading/writing files without having to worry about file type (uncompressed, gzip, bgzf) when reading.
Definition: InputFile.h:36
const char * getRefName(unsigned int indexNum) const
Return the reference name at the specified index or throws an exception if out of range...
Definition: Tabix.cpp:247
IFILE ifopen(const char *filename, const char *mode, InputFile::ifileCompression compressionMode=InputFile::DEFAULT)
Open a file with the specified name and mode, using a filename of "-" to indicate stdin/stdout...
Definition: InputFile.h:562
virtual void resetIndex()
Reset the member data for a new index file.
Definition: IndexBase.cpp:130
StatGenStatus::Status readIndex(const char *filename)
Definition: Tabix.cpp:52
void resetIndex()
Reset the member data for a new index file.
Definition: Tabix.cpp:39
Status
Return value enum for StatGenFile methods.
Definition: StatGenStatus.h:31