libStatGen Software  1
CigarRoller.cpp
1 /*
2  * Copyright (C) 2010-2011 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <ctype.h>
21 #include "CigarRoller.h"
22 
23 ////////////////////////////////////////////////////////////////////////
24 //
25 // Cigar Roller Class
26 //
27 
28 
30 {
31  std::vector<CigarOperator>::iterator i;
32  for (i = rhs.cigarOperations.begin(); i != rhs.cigarOperations.end(); i++)
33  {
34  (*this) += *i;
35  }
36  return *this;
37 }
38 
39 
40 //
41 // Append a new operator at the end of the sequence.
42 //
44 {
45  // Adding to the cigar, so the query & reference indexes would be
46  // incomplete, so just clear them.
47  clearQueryAndReferenceIndexes();
48 
49  if (rhs.count==0)
50  {
51  // nothing to do
52  }
53  else if (cigarOperations.empty() || cigarOperations.back() != rhs)
54  {
55  cigarOperations.push_back(rhs);
56  }
57  else
58  {
59  // last stored operation is the same as the new one, so just add it in
60  cigarOperations.back().count += rhs.count;
61  }
62  return *this;
63 }
64 
65 
67 {
68  clear();
69 
70  (*this) += rhs;
71 
72  return *this;
73 }
74 
75 
76 //
77 void CigarRoller::Add(Operation operation, int count)
78 {
79  CigarOperator rhs(operation, count);
80  (*this) += rhs;
81 }
82 
83 
84 void CigarRoller::Add(char operation, int count)
85 {
86  switch (operation)
87  {
88  case 0:
89  case 'M':
90  Add(match, count);
91  break;
92  case 1:
93  case 'I':
94  Add(insert, count);
95  break;
96  case 2:
97  case 'D':
98  Add(del, count);
99  break;
100  case 3:
101  case 'N':
102  Add(skip, count);
103  break;
104  case 4:
105  case 'S':
106  Add(softClip, count);
107  break;
108  case 5:
109  case 'H':
110  Add(hardClip, count);
111  break;
112  case 6:
113  case 'P':
114  Add(pad, count);
115  break;
116  case 7:
117  case '=':
118  Add(match, count);
119  break;
120  case 8:
121  case 'X':
122  Add(match, count);
123  break;
124  default:
125  // Hmmm... what to do?
126  std::cerr << "ERROR "
127  << "(" << __FILE__ << ":" << __LINE__ <<"): "
128  << "Parsing CIGAR - invalid character found "
129  << "with parameter " << operation << " and " << count
130  << std::endl;
131  break;
132  }
133 }
134 
135 
136 void CigarRoller::Add(const char *cigarString)
137 {
138  int operationCount = 0;
139  while (*cigarString)
140  {
141  if (isdigit(*cigarString))
142  {
143  char *endPtr;
144  operationCount = strtol((char *) cigarString, &endPtr, 10);
145  cigarString = endPtr;
146  }
147  else
148  {
149  Add(*cigarString, operationCount);
150  cigarString++;
151  }
152  }
153 }
154 
155 
156 bool CigarRoller::Remove(int index)
157 {
158  if((index < 0) || ((unsigned int)index >= cigarOperations.size()))
159  {
160  // can't remove, out of range, return false.
161  return(false);
162  }
163  cigarOperations.erase(cigarOperations.begin() + index);
164  // Modifying the cigar, so the query & reference indexes are out of date,
165  // so clear them.
166  clearQueryAndReferenceIndexes();
167  return(true);
168 }
169 
170 
171 bool CigarRoller::IncrementCount(int index, int increment)
172 {
173  if((index < 0) || ((unsigned int)index >= cigarOperations.size()))
174  {
175  // can't update, out of range, return false.
176  return(false);
177  }
178  cigarOperations[index].count += increment;
179 
180  // Modifying the cigar, so the query & reference indexes are out of date,
181  // so clear them.
182  clearQueryAndReferenceIndexes();
183  return(true);
184 }
185 
186 
187 bool CigarRoller::Update(int index, Operation op, int count)
188 {
189  if((index < 0) || ((unsigned int)index >= cigarOperations.size()))
190  {
191  // can't update, out of range, return false.
192  return(false);
193  }
194  cigarOperations[index].operation = op;
195  cigarOperations[index].count = count;
196 
197  // Modifying the cigar, so the query & reference indexes are out of date,
198  // so clear them.
199  clearQueryAndReferenceIndexes();
200  return(true);
201 }
202 
203 
204 void CigarRoller::Set(const char *cigarString)
205 {
206  clear();
207  Add(cigarString);
208 }
209 
210 
211 void CigarRoller::Set(const uint32_t* cigarBuffer, uint16_t bufferLen)
212 {
213  clear();
214 
215  // Parse the buffer.
216  for (int i = 0; i < bufferLen; i++)
217  {
218  int opLen = cigarBuffer[i] >> 4;
219 
220  Add(cigarBuffer[i] & 0xF, opLen);
221  }
222 }
223 
224 
225 //
226 // when we examine CIGAR strings, we need to know how
227 // many cumulative insert and delete positions there are
228 // so that we can adjust the read location appropriately.
229 //
230 // Here, we iterate over the vector of CIGAR operations,
231 // summaring the count for each insert or delete (insert
232 // increases the offset, delete decreases it).
233 //
234 // The use case for this is when we have a genome match
235 // position based on an index word other than the first one,
236 // and there is also a insert or delete between the beginning
237 // of the read and the index word. We can't simply report
238 // the match position without taking into account the indels,
239 // otherwise we'll be off by N where N is the sum of this
240 // indel count.
241 //
242 // DEPRECATED - do not use. There are better ways to accomplish that by using
243 // read lengths, reference lengths, span of the read, etc.
245 {
246  int offset = 0;
247  std::vector<CigarOperator>::iterator i;
248 
249  for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
250  {
251  switch (i->operation)
252  {
253  case insert:
254  offset += i->count;
255  break;
256  case del:
257  offset -= i->count;
258  break;
259  // TODO anything for case skip:????
260  default:
261  break;
262  }
263  }
264  return offset;
265 }
266 
267 
268 //
269 // Get the string reprentation of the Cigar operations in this object.
270 // Caller must delete the returned value.
271 //
273 {
274  // NB: the exact size of the string is not important, it just needs to be guaranteed
275  // larger than the largest number of characters we could put into it.
276 
277  // we do not explicitly manage memory usage, and we expect when program exits, the memory used here will be freed
278  static char *ret = NULL;
279  static unsigned int retSize = 0;
280 
281  if (ret == NULL)
282  {
283  retSize = cigarOperations.size() * 12 + 1; // 12 == a magic number -> > 1 + log base 10 of MAXINT
284  ret = (char*) malloc(sizeof(char) * retSize);
285  assert(ret != NULL);
286 
287  }
288  else
289  {
290  // currently, ret pointer has enough memory to use
291  if (retSize > cigarOperations.size() * 12 + 1)
292  {
293  }
294  else
295  {
296  retSize = cigarOperations.size() * 12 + 1;
297  free(ret);
298  ret = (char*) malloc(sizeof(char) * retSize);
299  }
300  assert(ret != NULL);
301  }
302 
303  char *ptr = ret;
304  char buf[12]; // > 1 + log base 10 of MAXINT
305 
306  std::vector<CigarOperator>::iterator i;
307 
308  // Progressively append the character representations of the operations to
309  // the cigar string we allocated above.
310 
311  *ptr = '\0'; // clear result string
312  for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
313  {
314  sprintf(buf, "%d%c", (*i).count, (*i).getChar());
315  strcat(ptr, buf);
316  while (*ptr)
317  {
318  ptr++; // limit the cost of strcat above
319  }
320  }
321  return ret;
322 }
323 
324 
326 {
327  // Clearing the cigar, so the query & reference indexes are out of
328  // date, so clear them.
329  clearQueryAndReferenceIndexes();
330  cigarOperations.clear();
331 }
CigarRoller::Add
void Add(Operation operation, int count)
Append the specified operation with the specified count to this object.
Definition: CigarRoller.cpp:77
Cigar::CigarOperator
Definition: Cigar.h:108
Cigar::Operation
Operation
Enum for the cigar operations.
Definition: Cigar.h:87
CigarRoller::clear
void clear()
Clear this object so that it has no Cigar Operations.
Definition: CigarRoller.cpp:325
CigarRoller::Set
void Set(const char *cigarString)
Sets this object to the specified cigarString.
Definition: CigarRoller.cpp:204
Cigar::pad
@ pad
Padding (not in reference or query). Associated with CIGAR Operation "P".
Definition: Cigar.h:96
Cigar::insert
@ insert
insertion to the reference (the query sequence contains bases that have no corresponding base in the ...
Definition: Cigar.h:91
Cigar::hardClip
@ hardClip
Hard clip on the read (clipped sequence not present in the query sequence or reference)....
Definition: Cigar.h:95
CigarRoller::Remove
bool Remove(int index)
Remove the operation at the specified index.
Definition: CigarRoller.cpp:156
CigarRoller::Update
bool Update(int index, Operation op, int count)
Updates the operation at the specified index to be the specified operation and have the specified cou...
Definition: CigarRoller.cpp:187
Cigar::match
@ match
match/mismatch operation. Associated with CIGAR Operation "M"
Definition: Cigar.h:89
Cigar::skip
@ skip
skipped region from the reference (the reference contains bases that have no corresponding base in th...
Definition: Cigar.h:93
Cigar::softClip
@ softClip
Soft clip on the read (clipped sequence present in the query sequence, but not in reference)....
Definition: Cigar.h:94
CigarRoller::getMatchPositionOffset
int getMatchPositionOffset()
DEPRECATED - do not use, there are better ways to accomplish that by using read lengths,...
Definition: CigarRoller.cpp:244
CigarRoller::getString
const char * getString()
Get the string reprentation of the Cigar operations in this object, caller must delete the returned v...
Definition: CigarRoller.cpp:272
CigarRoller::operator=
CigarRoller & operator=(CigarRoller &rhs)
Set this object to be equal to the specified CigarRoller.
Definition: CigarRoller.cpp:66
Cigar::del
@ del
deletion from the reference (the reference contains bases that have no corresponding base in the quer...
Definition: Cigar.h:92
CigarRoller
The purpose of this class is to provide accessors for setting, updating, modifying the CIGAR object....
Definition: CigarRoller.h:67
CigarRoller::operator+=
CigarRoller & operator+=(CigarRoller &rhs)
Add the contents of the specified CigarRoller to this object.
Definition: CigarRoller.cpp:29
CigarRoller::IncrementCount
bool IncrementCount(int index, int increment)
Increments the count for the operation at the specified index by the specified value,...
Definition: CigarRoller.cpp:171