libStatGen Software 1
GenomeSequence_test.cpp
1/*
2 * Copyright (C) 2010 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#include <gtest/gtest.h>
19
20#include "GenomeSequence.h"
21
22const char* RM_BS_REFERENCE = "rm -f ./phiX-bs.umfa";
23const char* RM_CS_REFERENCE = "rm -f ./phiX-cs.umfa";
24const char* REFERENCE_NAME = "./phiX.fa";
25
26TEST(GenomeSequenceTest, staticLookupTest)
27{
29 // quick sanity check...
30 EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'A']], 'A');
31 EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'a']], 'A');
32 EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'T']], 'T');
33 EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 't']], 'T');
34 EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'C']], 'C');
35 EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'c']], 'C');
36 EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'G']], 'G');
37 EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'g']], 'G');
38 EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'N']], 'N');
39 EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'n']], 'N');
40 EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'M']], 'M');
41 EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'm']], 'M');
42
43 EXPECT_EQ(GenomeSequence::base2int[(int) 'N'], 4);
44 EXPECT_EQ(GenomeSequence::base2int[(int) 'n'], 4);
45 EXPECT_EQ(GenomeSequence::base2int[(int) 'A'], 0);
46 EXPECT_EQ(GenomeSequence::base2int[(int) 'a'], 0);
47 EXPECT_EQ(GenomeSequence::base2int[(int) 'T'], 3);
48 EXPECT_EQ(GenomeSequence::base2int[(int) 't'], 3);
49 EXPECT_EQ(GenomeSequence::base2int[(int) 'C'], 1);
50 EXPECT_EQ(GenomeSequence::base2int[(int) 'c'], 1);
51 EXPECT_EQ(GenomeSequence::base2int[(int) 'G'], 2);
52 EXPECT_EQ(GenomeSequence::base2int[(int) 'g'], 2);
53}
54
55
56TEST(GenomeSequenceTest, testBaseSpaceReference)
57{
59 int exitCode = system(RM_BS_REFERENCE);
60 EXPECT_EQ(exitCode, 0);
61
62 s.setReferenceName(REFERENCE_NAME);
63 bool rc = s.create(false);
64 EXPECT_EQ(rc, false);
65 EXPECT_EQ(s[0], 'G');
66 EXPECT_EQ(s[1], 'A');
67 EXPECT_EQ(s[2], 'G');
68 EXPECT_EQ(s[s.getNumberBases()-3], 'G');
69 EXPECT_EQ(s[s.getNumberBases()-2], 'C');
70 EXPECT_EQ(s[s.getNumberBases()-1], 'A');
71 EXPECT_EQ(s[s.getNumberBases()], 'N'); // check bounds checker
72
73 s.close();
74}
75
76TEST(GenomeSequenceTest, testColorSpaceReference)
77{
79 int exitCode = system(RM_CS_REFERENCE);
80 EXPECT_EQ(exitCode, 0);
81
82 s.setReferenceName(REFERENCE_NAME);
83 bool rc = s.create(true);
84
85 // NB: I did not calculate these expected values, I just
86 // read them from the converted genome and set them here.
87 // So in theory, they should be checked by hand to ensure
88 // that they are correct.
89 EXPECT_EQ(rc, false);
90 EXPECT_EQ(s[0], 'N'); // in color space, first symbol is unknown
91 EXPECT_EQ(s[1], '2');
92 EXPECT_EQ(s[2], '2');
93 EXPECT_EQ(s[s.getNumberBases()-3], '1');
94 EXPECT_EQ(s[s.getNumberBases()-2], '3');
95 EXPECT_EQ(s[s.getNumberBases()-1], '1');
96 EXPECT_EQ(s[s.getNumberBases()], 'N'); // check bounds checker
97
98 s.close();
99}
100
101#if 0
102void simplestExample(void)
103{
104 GenomeSequence reference;
105 genomeIndex_t index;
106
107 // a particular reference is set by:
108 // reference.setFastaName("/usr/cluster/share/karma/human_g1k_v37_12CS.fa")
109 //
110 // In the above example, the suffix .fa is stripped and replaced with .umfa,
111 // which contains the actual file being opened.
112 //
113 if (reference.open()) {
114 perror("GenomeSequence::open");
115 exit(1);
116 }
117
118
119 index = 1000000000; // 10^9
120 //
121 // Write the base at the given index. Here, index is 0 based,
122 // and is across the whole genome, as all chromosomes are sequentially
123 // concatenated, so the allowed range is
124 //
125 // 0.. (reference.getChromosomeStart(last) + reference.getChromosomeSize(last))
126 //
127 // (where int last = reference.getChromosomeCount() - 1;)
128 //
129 std::cout << "base[" << index << "] = " << reference[index] << std::endl;
130
131 //
132 // Example for finding chromosome and one based chromosome position given
133 // and absolute position on the genome in 'index':
134 //
135 int chr = reference.getChromosome(index);
136 genomeIndex_t chrIndex = index - reference.getChromosomeStart(chr) + 1; // 1-based
137
138 std::cout << "genome index " << index << " corresponds to chromosome " << chr << " position " << chrIndex << std::endl;
139
140 //
141 // Example for finding an absolute genome index position when the
142 // chromosome name and one based position are known:
143 //
144 const char *chromosomeName = "5";
145 chr = reference.getChromosome(chromosomeName); // 0-based
146 chrIndex = 100000; // 1-based
147
148 index = reference.getChromosomeStart(chr) + chrIndex - 1;
149
150 std::cout << "Chromosome '" << chromosomeName << "' position " << chrIndex << " corresponds to genome index position " << index << std::endl;
151
152 reference.close();
153}
154
155void testGenomeSequence(void)
156{
157 GenomeSequence reference;
158
159#if 0
160 std::string referenceName = "someotherreference";
161 if (reference.setFastaName(referenceName)) {
162 std::cerr << "failed to open reference file "
163 << referenceName
164 << std::endl;
165 exit(1);
166 }
167#endif
168
169 std::cerr << "open and prefetch the reference genome: ";
170
171 // open it
172 if (reference.open()) {
173 exit(1);
174 }
175 std::cerr << "done!" << std::endl;
176
177 //
178 // For the human genome, genomeIndex ranges from 0 to 3.2x10^9
179 //
180 genomeIndex_t genomeIndex; // 0 based
181 unsigned int chromosomeIndex; // 1 based
182 unsigned int chromosome; // 0..23 or so
183 std::string chromosomeName;
184
185 //
186 // Here we'll start with a chromosome name, then obtain the genome
187 // index, and use it to find the base we want:
188 //
189 chromosomeName = "2";
190 chromosomeIndex = 1234567;
191 // this call is slow (string search for chromsomeName):
192 genomeIndex = reference.getGenomePosition(chromosomeName.c_str(), chromosomeIndex);
193 assert(genomeIndex!=INVALID_GENOME_INDEX);
194 std::cout << "Chromosome " << chromosomeName << ", index ";
195 std::cout << chromosomeIndex << " contains base " << reference[genomeIndex];
196 std::cout << " at genome index position " << genomeIndex << std::endl;
197
198 //
199 // now reverse it - given a genomeIndex from above, find the chromosome
200 // name and index:
201 //
202
203 // slow (binary search on genomeIndex):
204 chromosome = reference.getChromosome(genomeIndex);
205 unsigned int newChromosomeIndex;
206 // not slow:
207 newChromosomeIndex = genomeIndex - reference.getChromosomeStart(chromosome) + 1;
208
209 assert(chromosomeIndex == newChromosomeIndex);
210
211 // more testing... at least test and use PackedRead:
212 //
213 PackedRead pr;
214
215 pr.set("ATCGATCG", 0);
216 assert(pr.size()==8);
217 assert(pr[0]==GenomeSequence::base2int[(int) 'A']);
218 assert(pr[1]==GenomeSequence::base2int[(int) 'T']);
219 assert(pr[2]==GenomeSequence::base2int[(int) 'C']);
220 assert(pr[3]==GenomeSequence::base2int[(int) 'G']);
221 pr.set("ATCGATCG", 1);
222 assert(pr.size()==9);
223 pr.set("", 0);
224 assert(pr.size()==0);
225 pr.set("", 1);
226 assert(pr.size()==1);
227 pr.set("", 2);
228 assert(pr.size()==2);
229 pr.set("", 3);
230 assert(pr.size()==3);
231 assert(pr[0]==GenomeSequence::base2int[(int) 'N']);
232 assert(pr[1]==GenomeSequence::base2int[(int) 'N']);
233 assert(pr[2]==GenomeSequence::base2int[(int) 'N']);
234 pr.set("C", 1);
235 assert(pr.size()==2);
236 assert(pr[0]==GenomeSequence::base2int[(int) 'N']);
237 assert(pr[1]==GenomeSequence::base2int[(int) 'C']);
238
239}
240
241#endif
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
genomeIndex_t getChromosomeStart(int chromosomeIndex) const
given a chromosome, return the genome base it starts in
genomeIndex_t getGenomePosition(const char *chromosomeName, unsigned int chromosomeIndex) const
given a chromosome name and position, return the genome position
int getChromosome(genomeIndex_t position) const
given a whole genome index, get the chromosome it is located in
bool setReferenceName(std::string referenceFilename)
set the reference name that will be used in open()
genomeIndex_t getNumberBases() const
return the number of bases represented in this reference
bool open(bool isColorSpace=false, int flags=O_RDONLY)
open the reference specified using GenomeSequence::setReferenceName