libStatGen Software  1
GreedyTupleAligner.h
1 /*
2  * Copyright (C) 2010 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #ifndef _GREEDY_TUPLE_H
19 #define _GREEDY_TUPLE_H
20 
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <assert.h>
25 #include <ctype.h>
26 #include "Generic.h"
27 #include "CigarRoller.h"
28 
29 /*
30  *
31 
32  TODO:
33  1. how to efficiently find insertion?
34 
35 */
36 
37 /**
38  * Weight includes various penalties(e.g. gap open) used in local alignment
39  */
40 struct Weight
41 {
42 public:
43  Weight()
44  {
45  gapOpen = gapExtend = -1; // here we do not use affine gap penalty for simlicity.
46  mismatch = -1;
47  match= 2;
48  };
49  int gapOpen;
50  int gapExtend;
51  int mismatch;
52  int match;
53 };
54 
55 //
56 // tuple number is 3, arbitrary number from my guess!
57 // another reason
58 //
59 template <typename QueryType, typename ReferenceType, typename ReferenceIndex>
61 {
62 public:
63  GreedyTupleAligner(Weight& wt): weight(wt)
64  {/* */}
65 
66  /**
67  * Match 'query' to the 'reference' from 'searchStartIndex' up
68  * to 'searchSize', store matched length to 'matchedLength'
69  * and number of mismatch to 'mismatch'
70  * @param query input query
71  * @param queryLength length of query
72  * @param reference reference sequence
73  * @param searchStartIndex the positino where search starts
74  * @param searchSize the total length in reference sequence that will be examine
75  * @param matchedLength store how many bases are matched
76  * @param mismatch store how many bases are mismatched
77  * @return -1 for unsuccess return
78  */
80  const QueryType query,
81  const int queryLength,
82  const ReferenceType reference,
83  const ReferenceIndex searchStartIndex,
84  const int searchSize,
85  int& matchedLength,
86  int& mismatch)
87  {
88  // now use naive search,
89  // TODO: will incorportate KMP serach later
90  // TODO: adjust tolerance of mismatches
91  const int MAX_MISMATCH=2;
92  int bestPos = 0, bestMismatch = queryLength, bestMatchedLength = 0, bestScore=-1;
93 
94 #if defined(DEBUG_GREEDY_ALIGNER)
95  cout << "searchStartIndex == " << searchStartIndex << ", searchSize == " << searchSize << std::endl;
96 #endif
97  // here i is the matching position (inclusive)
98  // j is the matched length
99  for (int i = 0; i <= searchSize - tupleSize; i++)
100  {
101  int j = 0;
102  mismatch = 0;
103  while (j < queryLength)
104  {
105  if (searchStartIndex + i + j >= reference.getNumberBases())
106  break;
107  if (query[j] != reference[searchStartIndex + i + j])
108  {
109  mismatch++;
110  if (mismatch >= MAX_MISMATCH)
111  break;
112  }
113  j++;
114  }
115 
116  if (j>0 && (j==queryLength)) j--;
117 
118  while (searchStartIndex +i +j < reference.getNumberBases()
119  && ((j+1) > mismatch)
120  && mismatch>0
121  && query[j] != reference[searchStartIndex + i+j])
122  {
123  // if pattern matching goes beyong the preset mismatch cutoff,
124  // we will have to go backwards
125  j--;
126  mismatch--;
127  }
128 
129  int score = j - mismatch;
130 
131  if (score > bestScore)
132  {
133  bestPos = i;
134  bestScore = score;
135  bestMismatch = mismatch;
136  bestMatchedLength = j+1;
137  }
138  }
139 
140  if (bestScore > 0)
141  {
142  mismatch = bestMismatch;
143  matchedLength = bestMatchedLength;
144  return bestPos;
145  }
146  return -1;
147  }
148 
149  /**
150  * Core local alignment algorithm
151  * @param query input query
152  * @param queryLength length of query
153  * @param reference reference genome
154  * @param searchStartIndex matching starts here
155  * @param searchSize how far we will search
156  * @param cigarRoller store alignment results here
157  * @param matchPosition store match position
158  */
159  void Align(
160  QueryType query,
161  int queryLength,
162  ReferenceType reference,
163  ReferenceIndex searchStartIndex,
164  int searchSize,
165  CigarRoller& cigarRoller,
166  ReferenceIndex& matchPosition)
167  {
168  // Algorithm:
169  // finished align? (should we try different align position?)
170  // if not, try next tuple
171  // is the tuple aligned?
172  // yes, extend to previous, mark unmatched part mismatch or gap
173  // extend to next matched part
174  int r1 = 0; // a start index: reference starting from r1 (inclusive) will be used
175  int queryMatchCount = 0; // query matched # of bases
176  int q1 = 0; // to align
177  int pos = -1;
178  int lastR1 = -1; // index: record last
179 
180  cigarRoller.clear();
181  matchPosition = -1;
182 
183  while (queryMatchCount < queryLength)
184  {
185  if (r1 == searchSize - 1) // touched ref right boundary
186  {
187  cigarRoller.Add(CigarRoller::softClip, queryLength-queryMatchCount);
188  break;
189  }
190  if (queryLength - q1 < tupleSize)
191  {
192  // XXX this needs to do something more sane
193  // printf("some bases left!\n");
194  // a simple fix: treat all left-over bases as mismatches/matches
195  cigarRoller.Add(CigarRoller::mismatch, queryLength - queryMatchCount);
196  break;
197  }
198  int mismatch = 0;
199  int matchedLen = 0;
200  if ((pos = MatchTuple(query+q1, queryLength-q1, reference, searchStartIndex + r1, searchSize - r1, matchedLen, mismatch)) // found match position for tuple
201 
202  >= 0)
203  {
204  // found match position for tuple
205 
206  if (lastR1<0)
207  matchPosition = pos;
208 
209  //
210  // deal with left
211  //
212  if (lastR1>=0) // have previously aligned part of the query to the reference genome yet
213  {
214  if (pos > 0)
215  {
216  cigarRoller.Add(CigarRoller::del, pos);
217  }
218  }
219  else
220  {
221  lastR1 = pos;
222  }
223 
224  r1 += pos;
225  r1 += matchedLen;
226  q1 += matchedLen;
227 
228  //
229  // deal with right
230  //
231  cigarRoller.Add(CigarRoller::match, matchedLen);
232  queryMatchCount = q1;
233  lastR1 = r1;
234 
235  continue;
236  } // end if
237 
238  //
239  // try insertion
240  // maximum insert ? say 2
241  //
242  for (int i = 1; i < queryLength - q1 - tupleSize; i++)
243  {
244  int mismatch = 0;
245  int matchedLen = 0;
246  // check reference genome broundary
247  if (searchStartIndex + r1 >= reference.getNumberBases())
248  return;
249  if ((pos = MatchTuple(query+q1 + i ,
250  queryLength - q1 -i ,
251  reference,
252  searchStartIndex + r1,
253  searchSize - r1,
254  matchedLen,
255  mismatch)) // found match position for tuple
256  >= 0)
257  {
258  if (matchPosition < 0)
259  matchPosition = pos + q1 + i ;
260  }
261  queryMatchCount += i;
262  q1 += i;
263  cigarRoller.Add(CigarRoller::insert, i);
264 
265  lastR1 = r1 + pos;
266  r1 += pos + tupleSize;
267  q1 += tupleSize;
268 
269  // deal with right
270  while (searchStartIndex + r1 < reference.getNumberBases()
271  && query[q1]==reference[searchStartIndex + r1]
272  && q1 < queryLength)
273  {
274  r1++;
275  q1++;
276  }
277  if (q1 < queryLength)
278  {
279  cigarRoller.Add(CigarRoller::match, q1 - queryMatchCount);
280  queryMatchCount = q1;
281  }
282  else
283  {
284  cigarRoller.Add(CigarRoller::match, queryLength - queryMatchCount);
285  queryMatchCount = queryLength ;
286  break ;
287  }
288  }
289 
290  r1++;
291  q1++;
292 
293  // try next
294  } // end while (queryMatchCount < queryLength)
295  }
296 private:
297  static const int tupleSize = 3;
298  Weight weight;
299 };
300 
301 
302 #endif
CigarRoller::Add
void Add(Operation operation, int count)
Append the specified operation with the specified count to this object.
Definition: CigarRoller.cpp:77
GreedyTupleAligner::MatchTuple
int MatchTuple(const QueryType query, const int queryLength, const ReferenceType reference, const ReferenceIndex searchStartIndex, const int searchSize, int &matchedLength, int &mismatch)
Match 'query' to the 'reference' from 'searchStartIndex' up to 'searchSize', store matched length to ...
Definition: GreedyTupleAligner.h:79
CigarRoller::clear
void clear()
Clear this object so that it has no Cigar Operations.
Definition: CigarRoller.cpp:325
Cigar::insert
@ insert
insertion to the reference (the query sequence contains bases that have no corresponding base in the ...
Definition: Cigar.h:91
Weight
Weight includes various penalties(e.g.
Definition: GreedyTupleAligner.h:41
GreedyTupleAligner::Align
void Align(QueryType query, int queryLength, ReferenceType reference, ReferenceIndex searchStartIndex, int searchSize, CigarRoller &cigarRoller, ReferenceIndex &matchPosition)
Core local alignment algorithm.
Definition: GreedyTupleAligner.h:159
Cigar::match
@ match
match/mismatch operation. Associated with CIGAR Operation "M"
Definition: Cigar.h:89
Cigar::softClip
@ softClip
Soft clip on the read (clipped sequence present in the query sequence, but not in reference)....
Definition: Cigar.h:94
Cigar::mismatch
@ mismatch
mismatch operation. Associated with CIGAR Operation "M"
Definition: Cigar.h:90
Cigar::del
@ del
deletion from the reference (the reference contains bases that have no corresponding base in the quer...
Definition: Cigar.h:92
CigarRoller
The purpose of this class is to provide accessors for setting, updating, modifying the CIGAR object....
Definition: CigarRoller.h:67
GreedyTupleAligner
Definition: GreedyTupleAligner.h:61