libStatGen Software 1
GreedyTupleAligner.h
1/*
2 * Copyright (C) 2010 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#ifndef _GREEDY_TUPLE_H
19#define _GREEDY_TUPLE_H
20
21#include <stdio.h>
22#include <stdlib.h>
23#include <string.h>
24#include <assert.h>
25#include <ctype.h>
26#include "Generic.h"
27#include "CigarRoller.h"
28
29/*
30 *
31
32 TODO:
33 1. how to efficiently find insertion?
34
35*/
36
37/**
38 * Weight includes various penalties(e.g. gap open) used in local alignment
39 */
40struct Weight
41{
42public:
43 Weight()
44 {
45 gapOpen = gapExtend = -1; // here we do not use affine gap penalty for simlicity.
46 mismatch = -1;
47 match= 2;
48 };
49 int gapOpen;
50 int gapExtend;
51 int mismatch;
52 int match;
53};
54
55//
56// tuple number is 3, arbitrary number from my guess!
57// another reason
58//
59template <typename QueryType, typename ReferenceType, typename ReferenceIndex>
61{
62public:
63 GreedyTupleAligner(Weight& wt): weight(wt)
64 {/* */}
65
66 /**
67 * Match 'query' to the 'reference' from 'searchStartIndex' up
68 * to 'searchSize', store matched length to 'matchedLength'
69 * and number of mismatch to 'mismatch'
70 * @param query input query
71 * @param queryLength length of query
72 * @param reference reference sequence
73 * @param searchStartIndex the positino where search starts
74 * @param searchSize the total length in reference sequence that will be examine
75 * @param matchedLength store how many bases are matched
76 * @param mismatch store how many bases are mismatched
77 * @return -1 for unsuccess return
78 */
80 const QueryType query,
81 const int queryLength,
82 const ReferenceType reference,
83 const ReferenceIndex searchStartIndex,
84 const int searchSize,
85 int& matchedLength,
86 int& mismatch)
87 {
88 // now use naive search,
89 // TODO: will incorportate KMP serach later
90 // TODO: adjust tolerance of mismatches
91 const int MAX_MISMATCH=2;
92 int bestPos = 0, bestMismatch = queryLength, bestMatchedLength = 0, bestScore=-1;
93
94#if defined(DEBUG_GREEDY_ALIGNER)
95 cout << "searchStartIndex == " << searchStartIndex << ", searchSize == " << searchSize << std::endl;
96#endif
97 // here i is the matching position (inclusive)
98 // j is the matched length
99 for (int i = 0; i <= searchSize - tupleSize; i++)
100 {
101 int j = 0;
102 mismatch = 0;
103 while (j < queryLength)
104 {
105 if (searchStartIndex + i + j >= reference.getNumberBases())
106 break;
107 if (query[j] != reference[searchStartIndex + i + j])
108 {
109 mismatch++;
110 if (mismatch >= MAX_MISMATCH)
111 break;
112 }
113 j++;
114 }
115
116 if (j>0 && (j==queryLength)) j--;
117
118 while (searchStartIndex +i +j < reference.getNumberBases()
119 && ((j+1) > mismatch)
120 && mismatch>0
121 && query[j] != reference[searchStartIndex + i+j])
122 {
123 // if pattern matching goes beyong the preset mismatch cutoff,
124 // we will have to go backwards
125 j--;
126 mismatch--;
127 }
128
129 int score = j - mismatch;
130
131 if (score > bestScore)
132 {
133 bestPos = i;
134 bestScore = score;
135 bestMismatch = mismatch;
136 bestMatchedLength = j+1;
137 }
138 }
139
140 if (bestScore > 0)
141 {
142 mismatch = bestMismatch;
143 matchedLength = bestMatchedLength;
144 return bestPos;
145 }
146 return -1;
147 }
148
149 /**
150 * Core local alignment algorithm
151 * @param query input query
152 * @param queryLength length of query
153 * @param reference reference genome
154 * @param searchStartIndex matching starts here
155 * @param searchSize how far we will search
156 * @param cigarRoller store alignment results here
157 * @param matchPosition store match position
158 */
159 void Align(
160 QueryType query,
161 int queryLength,
162 ReferenceType reference,
163 ReferenceIndex searchStartIndex,
164 int searchSize,
165 CigarRoller& cigarRoller,
166 ReferenceIndex& matchPosition)
167 {
168 // Algorithm:
169 // finished align? (should we try different align position?)
170 // if not, try next tuple
171 // is the tuple aligned?
172 // yes, extend to previous, mark unmatched part mismatch or gap
173 // extend to next matched part
174 int r1 = 0; // a start index: reference starting from r1 (inclusive) will be used
175 int queryMatchCount = 0; // query matched # of bases
176 int q1 = 0; // to align
177 int pos = -1;
178 int lastR1 = -1; // index: record last
179
180 cigarRoller.clear();
181 matchPosition = -1;
182
183 while (queryMatchCount < queryLength)
184 {
185 if (r1 == searchSize - 1) // touched ref right boundary
186 {
187 cigarRoller.Add(CigarRoller::softClip, queryLength-queryMatchCount);
188 break;
189 }
190 if (queryLength - q1 < tupleSize)
191 {
192 // XXX this needs to do something more sane
193 // printf("some bases left!\n");
194 // a simple fix: treat all left-over bases as mismatches/matches
195 cigarRoller.Add(CigarRoller::mismatch, queryLength - queryMatchCount);
196 break;
197 }
198 int mismatch = 0;
199 int matchedLen = 0;
200 if ((pos = MatchTuple(query+q1, queryLength-q1, reference, searchStartIndex + r1, searchSize - r1, matchedLen, mismatch)) // found match position for tuple
201
202 >= 0)
203 {
204 // found match position for tuple
205
206 if (lastR1<0)
207 matchPosition = pos;
208
209 //
210 // deal with left
211 //
212 if (lastR1>=0) // have previously aligned part of the query to the reference genome yet
213 {
214 if (pos > 0)
215 {
216 cigarRoller.Add(CigarRoller::del, pos);
217 }
218 }
219 else
220 {
221 lastR1 = pos;
222 }
223
224 r1 += pos;
225 r1 += matchedLen;
226 q1 += matchedLen;
227
228 //
229 // deal with right
230 //
231 cigarRoller.Add(CigarRoller::match, matchedLen);
232 queryMatchCount = q1;
233 lastR1 = r1;
234
235 continue;
236 } // end if
237
238 //
239 // try insertion
240 // maximum insert ? say 2
241 //
242 for (int i = 1; i < queryLength - q1 - tupleSize; i++)
243 {
244 int mismatch = 0;
245 int matchedLen = 0;
246 // check reference genome broundary
247 if (searchStartIndex + r1 >= reference.getNumberBases())
248 return;
249 if ((pos = MatchTuple(query+q1 + i ,
250 queryLength - q1 -i ,
251 reference,
252 searchStartIndex + r1,
253 searchSize - r1,
254 matchedLen,
255 mismatch)) // found match position for tuple
256 >= 0)
257 {
258 if (matchPosition < 0)
259 matchPosition = pos + q1 + i ;
260 }
261 queryMatchCount += i;
262 q1 += i;
263 cigarRoller.Add(CigarRoller::insert, i);
264
265 lastR1 = r1 + pos;
266 r1 += pos + tupleSize;
267 q1 += tupleSize;
268
269 // deal with right
270 while (searchStartIndex + r1 < reference.getNumberBases()
271 && query[q1]==reference[searchStartIndex + r1]
272 && q1 < queryLength)
273 {
274 r1++;
275 q1++;
276 }
277 if (q1 < queryLength)
278 {
279 cigarRoller.Add(CigarRoller::match, q1 - queryMatchCount);
280 queryMatchCount = q1;
281 }
282 else
283 {
284 cigarRoller.Add(CigarRoller::match, queryLength - queryMatchCount);
285 queryMatchCount = queryLength ;
286 break ;
287 }
288 }
289
290 r1++;
291 q1++;
292
293 // try next
294 } // end while (queryMatchCount < queryLength)
295 }
296private:
297 static const int tupleSize = 3;
298 Weight weight;
299};
300
301
302#endif
The purpose of this class is to provide accessors for setting, updating, modifying the CIGAR object....
Definition: CigarRoller.h:67
void Add(Operation operation, int count)
Append the specified operation with the specified count to this object.
Definition: CigarRoller.cpp:77
void clear()
Clear this object so that it has no Cigar Operations.
@ del
deletion from the reference (the reference contains bases that have no corresponding base in the quer...
Definition: Cigar.h:92
@ mismatch
mismatch operation. Associated with CIGAR Operation "M"
Definition: Cigar.h:90
@ match
match/mismatch operation. Associated with CIGAR Operation "M"
Definition: Cigar.h:89
@ insert
insertion to the reference (the query sequence contains bases that have no corresponding base in the ...
Definition: Cigar.h:91
@ softClip
Soft clip on the read (clipped sequence present in the query sequence, but not in reference)....
Definition: Cigar.h:94
int MatchTuple(const QueryType query, const int queryLength, const ReferenceType reference, const ReferenceIndex searchStartIndex, const int searchSize, int &matchedLength, int &mismatch)
Match 'query' to the 'reference' from 'searchStartIndex' up to 'searchSize', store matched length to ...
void Align(QueryType query, int queryLength, ReferenceType reference, ReferenceIndex searchStartIndex, int searchSize, CigarRoller &cigarRoller, ReferenceIndex &matchPosition)
Core local alignment algorithm.
Weight includes various penalties(e.g.