libStatGen Software  1
GenomeSequenceHelpers.h
1 /*
2  * Copyright (C) 2010 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #ifndef _GENOME_SEQUENCE_HELPERS_H
19 #define _GENOME_SEQUENCE_HELPERS_H
20 
21 #if !defined(MD5_DIGEST_LENGTH)
22 #define MD5_DIGEST_LENGTH 16
23 #endif
24 
25 #include "MemoryMapArray.h"
26 
27 #include <stdint.h>
28 
29 
30 //
31 // ChromosomeInfo represents the per chromosome information
32 // necessary to write out SAM/BAM records. In addition, it
33 // contains a single internal index used to point to the vector
34 // offset where the chromosome bases start.
35 //
36 // This is mildly non-optimal for larger collections of chromosomes
37 // or contigs - one use case described having millions of contigs,
38 // in which case, this structure alone would take a gigabyte or more.
39 //
41 {
42  static const int MAX_GENOME_INFO_STRING=128;
43 
44  void constructorClear()
45  {
46  memset(this,0, sizeof(*this));
47  }
48  void setChromosomeName(const char *n)
49  {
50  strncpy(name, n, sizeof(name)-1);
51  name[sizeof(name)-1] = '\0';
52  }
53  genomeIndex_t start; // internal offset to combined genome vector
54  genomeIndex_t size; // SAM SQ:LN value
55  char md5[2*MD5_DIGEST_LENGTH + 1]; // 32 chars plus NUL, SAM SQ:M5 value
56  char name[MAX_GENOME_INFO_STRING]; // SAM SQ:SN value
57  char assemblyID[MAX_GENOME_INFO_STRING]; // SAM SQ:AS value
58  char uri[MAX_GENOME_INFO_STRING]; // SAM SQ:UR value
59  char species[MAX_GENOME_INFO_STRING]; // SAM SQ:SP value
60 
61  // handy setting methods:
62  void setAssemblyID(const char *newID)
63  {
64  strncpy(assemblyID, newID, sizeof(assemblyID)-1);
65  name[sizeof(name)-1] = '\0';
66  }
67  void setSpecies(const char *newSpecies)
68  {
69  strncpy(species, newSpecies, sizeof(species)-1);
70  species[sizeof(species)-1] = '\0';
71  }
72  void setURI(const char *newURI)
73  {
74  strncpy(uri, newURI, sizeof(uri)-1);
75  uri[sizeof(uri)-1] = '\0';
76  }
77 };
78 
80 {
81  friend class GenomeSequence;
82  friend std::ostream &operator << (std::ostream &, genomeSequenceMmapHeader &);
83 private:
84  uint32_t _chromosomeCount;
85  bool _colorSpace;
86 
87  ChromosomeInfo _chromosomes[0];
88 
89 public:
90  //
91  // getHeaderSize is special in that it must not access any
92  // member variables, since it is called when the header has
93  // not been created yet.
94  //
95  static size_t getHeaderSize(int chromosomeCount)
96  {
97  return sizeof(genomeSequenceMmapHeader) + sizeof(ChromosomeInfo[1]) * chromosomeCount;
98  }
99  //
100  // below methods return TRUE if it failed, false otherwise (primarily
101  // a length check).
102  //
103 };
104 
105 std::ostream &operator << (std::ostream &stream, genomeSequenceMmapHeader &h);
106 
107 //
108 // define the genomeSequence array type:
109 //
110 // NB the access/set routines use the encoded base values in the range
111 // 0-15, not the corresponding base pair character.
112 //
113 inline uint8_t genomeSequenceAccess(void *base, genomeIndex_t index)
114 {
115  if ((index&1)==0)
116  {
117  return ((uint8_t *) base)[index>>1] & 0xf;
118  }
119  else
120  {
121  return (((uint8_t *) base)[index>>1] & 0xf0) >> 4;
122  }
123 };
124 inline void genomeSequenceSet(void *base, genomeIndex_t index, uint8_t v)
125 {
126  if ((index&1)==0)
127  {
128  ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0xf0) | v;
129  }
130  else
131  {
132  ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0x0f) | v<<4;
133  }
134 }
135 
136 inline size_t mmapGenomeSequenceElementCount2Bytes(genomeIndex_t i)
137 {
138  return sizeof(uint8_t) * i / 2;
139 }
140 
142 {
143  void set(int index, int val)
144  {
145  packedBases[index>>1] =
146  (packedBases[index>>1] // original value
147  & ~(7<<((index&0x01)<<2))) // logical AND off the original value
148  | ((val&0x0f)<<((index&0x1)<<2)); // logical OR in the new value
149  }
150 public:
151  std::vector<uint8_t> packedBases;
152  uint32_t length;
153  int size()
154  {
155  return length;
156  }
157  void clear()
158  {
159  packedBases.clear();
160  length=0;
161  }
162  void set(const char *rhs, int padWithNCount = 0);
163  uint8_t operator [](int index)
164  {
165  return (packedBases[index>>1] >> ((index&0x1)<<2)) & 0xf;
166  }
167 };
168 
169 #endif
GenomeSequence
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
Definition: GenomeSequence.h:100
PackedRead
Definition: GenomeSequenceHelpers.h:142
MemoryMapArrayHeader
Definition: MemoryMapArray.h:65
genomeSequenceMmapHeader
Definition: GenomeSequenceHelpers.h:80
operator<<
InputFile & operator<<(InputFile &stream, const std::string &str)
Write to a file using streaming.
Definition: InputFile.h:736
ChromosomeInfo
Definition: GenomeSequenceHelpers.h:41