libStatGen Software  1
GenomeSequence_test.cpp
1 /*
2  * Copyright (C) 2010 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include <gtest/gtest.h>
19 
20 #include "GenomeSequence.h"
21 
22 const char* RM_BS_REFERENCE = "rm -f ./phiX-bs.umfa";
23 const char* RM_CS_REFERENCE = "rm -f ./phiX-cs.umfa";
24 const char* REFERENCE_NAME = "./phiX.fa";
25 
26 TEST(GenomeSequenceTest, staticLookupTest)
27 {
29  // quick sanity check...
30  EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'A']], 'A');
31  EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'a']], 'A');
32  EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'T']], 'T');
33  EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 't']], 'T');
34  EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'C']], 'C');
35  EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'c']], 'C');
36  EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'G']], 'G');
37  EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'g']], 'G');
38  EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'N']], 'N');
39  EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'n']], 'N');
40  EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'M']], 'M');
41  EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'm']], 'M');
42 
43  EXPECT_EQ(GenomeSequence::base2int[(int) 'N'], 4);
44  EXPECT_EQ(GenomeSequence::base2int[(int) 'n'], 4);
45  EXPECT_EQ(GenomeSequence::base2int[(int) 'A'], 0);
46  EXPECT_EQ(GenomeSequence::base2int[(int) 'a'], 0);
47  EXPECT_EQ(GenomeSequence::base2int[(int) 'T'], 3);
48  EXPECT_EQ(GenomeSequence::base2int[(int) 't'], 3);
49  EXPECT_EQ(GenomeSequence::base2int[(int) 'C'], 1);
50  EXPECT_EQ(GenomeSequence::base2int[(int) 'c'], 1);
51  EXPECT_EQ(GenomeSequence::base2int[(int) 'G'], 2);
52  EXPECT_EQ(GenomeSequence::base2int[(int) 'g'], 2);
53 }
54 
55 
56 TEST(GenomeSequenceTest, testBaseSpaceReference)
57 {
59  int exitCode = system(RM_BS_REFERENCE);
60  EXPECT_EQ(exitCode, 0);
61 
62  s.setReferenceName(REFERENCE_NAME);
63  bool rc = s.create(false);
64  EXPECT_EQ(rc, false);
65  EXPECT_EQ(s[0], 'G');
66  EXPECT_EQ(s[1], 'A');
67  EXPECT_EQ(s[2], 'G');
68  EXPECT_EQ(s[s.getNumberBases()-3], 'G');
69  EXPECT_EQ(s[s.getNumberBases()-2], 'C');
70  EXPECT_EQ(s[s.getNumberBases()-1], 'A');
71  EXPECT_EQ(s[s.getNumberBases()], 'N'); // check bounds checker
72 
73  s.close();
74 }
75 
76 TEST(GenomeSequenceTest, testColorSpaceReference)
77 {
79  int exitCode = system(RM_CS_REFERENCE);
80  EXPECT_EQ(exitCode, 0);
81 
82  s.setReferenceName(REFERENCE_NAME);
83  bool rc = s.create(true);
84 
85  // NB: I did not calculate these expected values, I just
86  // read them from the converted genome and set them here.
87  // So in theory, they should be checked by hand to ensure
88  // that they are correct.
89  EXPECT_EQ(rc, false);
90  EXPECT_EQ(s[0], 'N'); // in color space, first symbol is unknown
91  EXPECT_EQ(s[1], '2');
92  EXPECT_EQ(s[2], '2');
93  EXPECT_EQ(s[s.getNumberBases()-3], '1');
94  EXPECT_EQ(s[s.getNumberBases()-2], '3');
95  EXPECT_EQ(s[s.getNumberBases()-1], '1');
96  EXPECT_EQ(s[s.getNumberBases()], 'N'); // check bounds checker
97 
98  s.close();
99 }
100 
101 #if 0
102 void simplestExample(void)
103 {
104  GenomeSequence reference;
105  genomeIndex_t index;
106 
107  // a particular reference is set by:
108  // reference.setFastaName("/usr/cluster/share/karma/human_g1k_v37_12CS.fa")
109  //
110  // In the above example, the suffix .fa is stripped and replaced with .umfa,
111  // which contains the actual file being opened.
112  //
113  if (reference.open()) {
114  perror("GenomeSequence::open");
115  exit(1);
116  }
117 
118 
119  index = 1000000000; // 10^9
120  //
121  // Write the base at the given index. Here, index is 0 based,
122  // and is across the whole genome, as all chromosomes are sequentially
123  // concatenated, so the allowed range is
124  //
125  // 0.. (reference.getChromosomeStart(last) + reference.getChromosomeSize(last))
126  //
127  // (where int last = reference.getChromosomeCount() - 1;)
128  //
129  std::cout << "base[" << index << "] = " << reference[index] << std::endl;
130 
131  //
132  // Example for finding chromosome and one based chromosome position given
133  // and absolute position on the genome in 'index':
134  //
135  int chr = reference.getChromosome(index);
136  genomeIndex_t chrIndex = index - reference.getChromosomeStart(chr) + 1; // 1-based
137 
138  std::cout << "genome index " << index << " corresponds to chromosome " << chr << " position " << chrIndex << std::endl;
139 
140  //
141  // Example for finding an absolute genome index position when the
142  // chromosome name and one based position are known:
143  //
144  const char *chromosomeName = "5";
145  chr = reference.getChromosome(chromosomeName); // 0-based
146  chrIndex = 100000; // 1-based
147 
148  index = reference.getChromosomeStart(chr) + chrIndex - 1;
149 
150  std::cout << "Chromosome '" << chromosomeName << "' position " << chrIndex << " corresponds to genome index position " << index << std::endl;
151 
152  reference.close();
153 }
154 
155 void testGenomeSequence(void)
156 {
157  GenomeSequence reference;
158 
159 #if 0
160  std::string referenceName = "someotherreference";
161  if (reference.setFastaName(referenceName)) {
162  std::cerr << "failed to open reference file "
163  << referenceName
164  << std::endl;
165  exit(1);
166  }
167 #endif
168 
169  std::cerr << "open and prefetch the reference genome: ";
170 
171  // open it
172  if (reference.open()) {
173  exit(1);
174  }
175  std::cerr << "done!" << std::endl;
176 
177  //
178  // For the human genome, genomeIndex ranges from 0 to 3.2x10^9
179  //
180  genomeIndex_t genomeIndex; // 0 based
181  unsigned int chromosomeIndex; // 1 based
182  unsigned int chromosome; // 0..23 or so
183  std::string chromosomeName;
184 
185  //
186  // Here we'll start with a chromosome name, then obtain the genome
187  // index, and use it to find the base we want:
188  //
189  chromosomeName = "2";
190  chromosomeIndex = 1234567;
191  // this call is slow (string search for chromsomeName):
192  genomeIndex = reference.getGenomePosition(chromosomeName.c_str(), chromosomeIndex);
193  assert(genomeIndex!=INVALID_GENOME_INDEX);
194  std::cout << "Chromosome " << chromosomeName << ", index ";
195  std::cout << chromosomeIndex << " contains base " << reference[genomeIndex];
196  std::cout << " at genome index position " << genomeIndex << std::endl;
197 
198  //
199  // now reverse it - given a genomeIndex from above, find the chromosome
200  // name and index:
201  //
202 
203  // slow (binary search on genomeIndex):
204  chromosome = reference.getChromosome(genomeIndex);
205  unsigned int newChromosomeIndex;
206  // not slow:
207  newChromosomeIndex = genomeIndex - reference.getChromosomeStart(chromosome) + 1;
208 
209  assert(chromosomeIndex == newChromosomeIndex);
210 
211  // more testing... at least test and use PackedRead:
212  //
213  PackedRead pr;
214 
215  pr.set("ATCGATCG", 0);
216  assert(pr.size()==8);
217  assert(pr[0]==GenomeSequence::base2int[(int) 'A']);
218  assert(pr[1]==GenomeSequence::base2int[(int) 'T']);
219  assert(pr[2]==GenomeSequence::base2int[(int) 'C']);
220  assert(pr[3]==GenomeSequence::base2int[(int) 'G']);
221  pr.set("ATCGATCG", 1);
222  assert(pr.size()==9);
223  pr.set("", 0);
224  assert(pr.size()==0);
225  pr.set("", 1);
226  assert(pr.size()==1);
227  pr.set("", 2);
228  assert(pr.size()==2);
229  pr.set("", 3);
230  assert(pr.size()==3);
231  assert(pr[0]==GenomeSequence::base2int[(int) 'N']);
232  assert(pr[1]==GenomeSequence::base2int[(int) 'N']);
233  assert(pr[2]==GenomeSequence::base2int[(int) 'N']);
234  pr.set("C", 1);
235  assert(pr.size()==2);
236  assert(pr[0]==GenomeSequence::base2int[(int) 'N']);
237  assert(pr[1]==GenomeSequence::base2int[(int) 'C']);
238 
239 }
240 
241 #endif
GenomeSequence::getChromosomeStart
genomeIndex_t getChromosomeStart(int chromosomeIndex) const
given a chromosome, return the genome base it starts in
Definition: GenomeSequence.h:246
GenomeSequence
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
Definition: GenomeSequence.h:100
PackedRead
Definition: GenomeSequenceHelpers.h:142
GenomeSequence::getChromosome
int getChromosome(genomeIndex_t position) const
given a whole genome index, get the chromosome it is located in
Definition: GenomeSequence.cpp:737
GenomeSequence::setReferenceName
bool setReferenceName(std::string referenceFilename)
set the reference name that will be used in open()
Definition: GenomeSequence.cpp:254
GenomeSequence::open
bool open(bool isColorSpace=false, int flags=O_RDONLY)
open the reference specified using GenomeSequence::setReferenceName
Definition: GenomeSequence.cpp:182
GenomeSequence::getGenomePosition
genomeIndex_t getGenomePosition(const char *chromosomeName, unsigned int chromosomeIndex) const
given a chromosome name and position, return the genome position
Definition: GenomeSequence.cpp:779
GenomeSequence::getNumberBases
genomeIndex_t getNumberBases() const
return the number of bases represented in this reference
Definition: GenomeSequence.h:216