libStatGen Software 1
IndexBase.cpp
1/*
2 * Copyright (C) 2010-2012 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#include "IndexBase.h"
19#include <iomanip>
20
21Chunk SortedChunkList::pop()
22{
23 Chunk newChunk = chunkList.begin()->second;
24 chunkList.erase(chunkList.begin());
25 return(newChunk);
26}
27
28
29bool SortedChunkList::insert(const Chunk& chunkToInsert)
30{
31 std::pair<std::map<uint64_t, Chunk>::iterator, bool> insertRes;
32 // Insert the passed in chunk.
33 insertRes =
34 chunkList.insert(std::pair<uint64_t, Chunk>(chunkToInsert.chunk_beg,
35 chunkToInsert));
36
37 if(!insertRes.second)
38 {
39 // Failed to insert the chunk.
40 std::cerr << "Failed to insert into the SortedChunkList.\n";
41 std::cerr << "\tpreviously found chunk:\tbeg = " << std::hex
42 << insertRes.first->second.chunk_beg
43 << "\tend = "
44 << insertRes.first->second.chunk_end
45 << "\nnew chunk:\tbeg = " << std::hex
46 << chunkToInsert.chunk_beg
47 << "\tend = "
48 << chunkToInsert.chunk_end
49 << std::endl;
50 }
51 // return the result that comes from insertRes.
52 return(insertRes.second);
53}
54
55void SortedChunkList::clear()
56{
57 chunkList.clear();
58}
59
60bool SortedChunkList::empty()
61{
62 return(chunkList.empty());
63}
64
65
66// Merge overlapping chunks found in this list.
67bool SortedChunkList::mergeOverlapping()
68{
69 // Start at the beginning of the list and iterate through.
70 std::map<uint64_t, Chunk>::iterator currentPos = chunkList.begin();
71 std::map<uint64_t, Chunk>::iterator nextPos = chunkList.begin();
72 if(nextPos != chunkList.end())
73 {
74 ++nextPos;
75 }
76
77 // Loop until the end is reached.
78 while(nextPos != chunkList.end())
79 {
80 // If the next chunk is completely contained within the current
81 // chunk (its end is less than the current chunk's end),
82 // delete it since its position is already covered.
83 if(nextPos->second.chunk_end < currentPos->second.chunk_end)
84 {
85 chunkList.erase(nextPos);
86 nextPos = currentPos;
87 ++nextPos;
88 continue;
89 }
90
91 // If the next chunk's start position's BGZF block is less than or
92 // equal to the BGZF block of the current chunk's end position,
93 // combine the two chunks into the current chunk.
94 if((nextPos->second.chunk_beg >> 16) <=
95 (currentPos->second.chunk_end >> 16))
96 {
97 currentPos->second.chunk_end = nextPos->second.chunk_end;
98 // nextPos has now been included in the current pos, so
99 // remove it.
100 chunkList.erase(nextPos);
101 nextPos = currentPos;
102 ++nextPos;
103 continue;
104 }
105 else
106 {
107 // Nothing to combine. So try combining at the next
108 currentPos = nextPos;
109 ++nextPos;
110 }
111 }
112 return(true);
113}
114
115
116IndexBase::IndexBase()
117 : n_ref(0)
118{
119 myRefs.clear();
120}
121
122
123
124IndexBase::~IndexBase()
125{
126}
127
128
129// Reset the member data for a new index file.
131{
132 n_ref = 0;
133 // Clear the references.
134 myRefs.clear();
135}
136
137
138// Get the number of references in this index.
140{
141 // Return the number of references.
142 return(myRefs.size());
143}
144
145
146// The basic logic is from samtools reg2bins and the samtools format specification pdf.
147// Set bins in the region to 1 and all other bins to 0.
148void IndexBase::getBinsForRegion(uint32_t start, uint32_t end, bool binMap[MAX_NUM_BINS+1])
149{
150 for(uint32_t index = 0; index < MAX_NUM_BINS+1; index++)
151 {
152 binMap[index] = false;
153 }
154
155 uint32_t binNum = 0;
156 --end;
157
158 // Check if beg/end go too high, set to max position.
159 if(start > MAX_POSITION)
160 {
161 start = MAX_POSITION;
162 }
163 if(end > MAX_POSITION)
164 {
165 end = MAX_POSITION;
166 }
167
168 // Turn on bins.
169 binMap[binNum] = true;
170 for (binNum = 1 + (start>>26); binNum <= 1 + (end>>26); ++binNum)
171 binMap[binNum] = true;
172 for (binNum = 9 + (start>>23); binNum <= 9 + (end>>23); ++binNum)
173 binMap[binNum] = true;
174 for (binNum = 73 + (start>>20); binNum <= 73 + (end>>20); ++binNum)
175 binMap[binNum] = true;
176 for (binNum = 585 + (start>>17); binNum <= 585 + (end>>17); ++binNum)
177 binMap[binNum] = true;
178 for (binNum = 4681 + (start>>14); binNum <= 4681 + (end>>14); ++binNum)
179 binMap[binNum] = true;
180}
181
182
183// Returns the minimum offset of records that cross the 16K block that
184// contains the specified position for the given reference id.
185bool IndexBase::getMinOffsetFromLinearIndex(int32_t refID, uint32_t position,
186 uint64_t& minOffset) const
187{
188 int32_t linearIndex = position >> LINEAR_INDEX_SHIFT;
189
190 minOffset = 0;
191
192 if(refID > n_ref)
193 {
194 // out of range of the references, return false.
195 return(false);
196 }
197 // Check to see if the position is out of range of the linear index.
198 int32_t linearOffsetSize = myRefs[refID].n_intv;
199
200 // If there are no entries in the linear index, return false.
201 // Or if the linear index is not large enough to include
202 // the start block, then there can be no records that cross
203 // our region, so return false.
204 if((linearOffsetSize == 0) || (linearIndex >= linearOffsetSize))
205
206 {
207 return(false);
208 }
209
210 // The linear index is specified for this block, so return that value.
211 minOffset = myRefs[refID].ioffsets[linearIndex];
212
213 // If the offset is 0, go to the previous block that has an offset.
214 // This is due to a couple of bugs in older sam tools indexes.
215 // 1) they add one to the index location (so when reading those, you
216 // may be starting earlier than necessary)
217 // 2) (the bigger issue) They did not include bins 4681-37449 in
218 // the linear index.
219 while((minOffset == 0) && (--linearIndex >= 0))
220 {
221 minOffset = myRefs[refID].ioffsets[linearIndex];
222 }
223
224
225 // If the minOffset is still 0 when moving forward,
226 // check later indices to find a non-zero since we don't want to return
227 // an offset of 0 since the record can't start at 0 we want to at least
228 // return the first record position for this reference.
229 linearIndex = 0;
230 while((minOffset == 0) && (linearIndex < linearOffsetSize))
231 {
232 minOffset = myRefs[refID].ioffsets[linearIndex];
233 linearIndex++;
234 }
235 if(minOffset == 0)
236 {
237 // Could not find a valid start position for this reference.
238 return(false);
239 }
240 return(true);
241}
int32_t getNumRefs() const
Get the number of references in this index.
Definition: IndexBase.cpp:139
virtual void resetIndex()
Reset the member data for a new index file.
Definition: IndexBase.cpp:130