libStatGen Software  1
PedigreeLoader.cpp
1 /*
2  * Copyright (C) 2010 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include "Pedigree.h"
19 #include "FortranFormat.h"
20 #include "Error.h"
21 
22 #include <stdlib.h>
23 #include <ctype.h>
24 #include <string.h>
25 
26 void Pedigree::Prepare(IFILE & input)
27 {
28  pd.Load(input);
29 }
30 
31 void Pedigree::Load(IFILE & input)
32 {
33  if (pd.mendelFormat)
34  {
35  LoadMendel(input);
36  return;
37  }
38 
39  int sexCovariate = sexAsCovariate ? GetCovariateID("sex") : -1;
40 
41  int textCols = pd.CountTextColumns() + 5;
42  int oldCount = count;
43  bool warn = true;
44  int line = 0;
45 
46  String buffer;
47  StringArray tokens;
48 
49  while (!ifeof(input))
50  {
51  int field = 0;
52 
53  buffer.ReadLine(input);
54 
55  tokens.Clear();
56  tokens.AddTokens(buffer, WHITESPACE);
57 
58  if (tokens.Length() == 0) continue;
59  if (tokens[0].SlowCompare("end") == 0) break;
60 
61  line++;
62 
63  if (tokens.Length() < textCols)
64  {
65  if (buffer.Length() > 79)
66  {
67  buffer.SetLength(75);
68  buffer += " ...";
69  }
70 
71  String description;
72 
73  pd.ColumnSummary(description);
74  error("Loading Pedigree...\n\n"
75  "Expecting %d columns (%s),\n"
76  "but read only %d columns in line %d.\n\n"
77  "The problem line is transcribed below:\n%s\n",
78  textCols, (const char *) description,
79  tokens.Length(), line, (const char *) buffer);
80  }
81 
82  if (tokens.Length() > textCols && warn && textCols > 5)
83  {
84  pd.ColumnSummary(buffer);
85  printf("WARNING -- Trailing columns in pedigree file will be ignored\n"
86  " Expecting %d data columns (%s)\n"
87  " However line %d, for example, has %d data columns\n\n",
88  textCols - 5, (const char *) buffer, line, tokens.Length() - 5);
89  warn = false;
90  }
91 
92  Person * p;
93 
94  // create a new person if necessary
95  if (oldCount==0 || (p = FindPerson(tokens[0], tokens[1], oldCount))==NULL)
96  {
97  if (count == size) Grow();
98 
99  p = persons[count++] = new Person;
100  }
101 
102  p->famid = tokens[field++]; // famid
103  p->pid = tokens[field++]; // pid
104  p->fatid = tokens[field++]; // fatid
105  p->motid = tokens[field++]; // motid
106 
107  bool failure = false;
108  p->sex = TranslateSexCode(tokens[field++], failure);
109  if (failure)
110  error("Can't interpret the sex of individual #%d\n"
111  "Family: %s Individual: %s Sex Code: %s", count,
112  (const char *) p->famid, (const char *) p->pid,
113  (const char *) tokens[field-1]);
114 
115  if (sexAsCovariate)
116  {
117  if (p->sex)
118  p->covariates[sexCovariate] = p->sex;
119  else
120  p->covariates[sexCovariate] = _NAN_;
121  }
122 
123  for (int col = 0; col < pd.columnCount; col++)
124  switch (pd.columns[col])
125  {
126  case pcAffection :
127  {
128  int a = pd.columnHash[col];
129  int new_status;
130 
131  const char * affection = tokens[field++];
132 
133  switch (toupper(affection[0]))
134  {
135  case '1' :
136  case 'N' :
137  case 'U' :
138  new_status = 1;
139  break;
140  case '2' :
141  case 'D' :
142  case 'A' :
143  case 'Y' :
144  new_status = 2;
145  break;
146  default :
147  new_status = atoi(affection);
148  if (new_status < 0 || new_status > 2)
149  error("Incorrect formating for affection status "
150  "Col %d, Affection %s\n"
151  "Family: %s Individual: %s Status: %s",
152  col, (const char *) affectionNames[a],
153  (const char *) p->famid, (const char *) p->pid,
154  affection);
155  }
156  if (new_status != 0 && p->affections[a] != 0 &&
157  new_status != p->affections[a])
158  error("Conflict with previous affection status - "
159  "Col %d, Affection %s\n"
160  "Family: %s Individual: %s Old: %d New: %d",
161  col, (const char *) affectionNames[a],
162  (const char *) p->famid, (const char *) p->pid,
163  p->affections[a], new_status);
164  if (new_status) p->affections[a] = new_status;
165  break;
166  }
167  case pcMarker :
168  {
169  int m = pd.columnHash[col];
170 
171  Alleles new_genotype;
172 
173  new_genotype[0] = LoadAllele(m, tokens[field++]);
174  new_genotype[1] = LoadAllele(m, tokens[field++]);
175 
176  if (p->markers[m].isKnown() && new_genotype.isKnown() &&
177  new_genotype != p->markers[m])
178  {
179  MarkerInfo * info = GetMarkerInfo(m);
180 
181  error("Conflict with previous genotype - Col %d, Marker %s\n"
182  "Family: %s Individual: %s Old: %s/%s New: %s/%s",
183  col, (const char *) markerNames[m],
184  (const char *) p->famid, (const char *) p->pid,
185  (const char *) info->GetAlleleLabel(p->markers[m][0]),
186  (const char *) info->GetAlleleLabel(p->markers[m][1]),
187  (const char *) info->GetAlleleLabel(new_genotype[0]),
188  (const char *) info->GetAlleleLabel(new_genotype[1]));
189  }
190 
191  if (new_genotype.isKnown()) p->markers[m] = new_genotype;
192  break;
193  }
194  case pcTrait :
195  case pcUndocumentedTraitCovariate :
196  {
197  int t = pd.columnHash[col];
198  double new_pheno = _NAN_;
199 
200  if (pd.columns[col] == pcUndocumentedTraitCovariate)
201  t = t / 32768;
202 
203  const char * value = tokens[field++];
204  char * flag = NULL;
205 
206  if (missing == (const char *) NULL || strcmp(value, missing) != 0)
207  new_pheno = strtod(value, &flag);
208  if (flag != NULL && *flag) new_pheno = _NAN_;
209 
210  if (p->traits[t] != _NAN_ && new_pheno != _NAN_ &&
211  new_pheno != p->traits[t])
212  error("Conflict with previous phenotype - Col %d, Trait %s\n"
213  "Family: %s Individual: %s Old: %f New: %f",
214  col, (const char *) traitNames[t],
215  (const char *) p->famid, (const char *) p->pid,
216  p->traits[t], new_pheno);
217 
218  if (new_pheno != _NAN_) p->traits[t] = new_pheno;
219  if (pd.columns[col] == pcTrait) break;
220  }
221  case pcCovariate :
222  {
223  int c = pd.columnHash[col];
224  double new_covar = _NAN_;
225 
226  if (pd.columns[col] == pcUndocumentedTraitCovariate)
227  {
228  c = c % 32768;
229  field--;
230  }
231 
232  const char * value = tokens[field++];
233  char * flag = NULL;
234 
235  if (missing == (const char *) NULL || strcmp(value, missing) != 0)
236  new_covar = strtod(value, &flag);
237  if (flag != NULL && *flag) new_covar = _NAN_;
238 
239  if (p->covariates[c] != _NAN_ && new_covar != _NAN_ &&
240  new_covar != p->covariates[c])
241  error("Conflict with previous value - Col %d, Covariate %s\n"
242  "Family: %s Individual: %s Old: %f New: %f",
243  col, (const char *) covariateNames[c],
244  (const char *) p->famid, (const char *) p->pid,
245  p->covariates[c], new_covar);
246 
247  if (new_covar != _NAN_) p->covariates[c] = new_covar;
248  break;
249  }
250  case pcString :
251  {
252  int c = pd.columnHash[col];
253 
254  if (!p->strings[c].IsEmpty() && p->strings[c] != tokens[field])
255  error("Conflict with previous value - Col %d, String %s\n"
256  "Family: %s Individual: %s Old: %s New: %s",
257  col, (const char *) stringNames[c],
258  (const char *) p->famid, (const char *) p->pid,
259  (const char *) p->strings[c], (const char *) tokens[field]);
260 
261  p->strings[c] = tokens[field++];
262 
263  break;
264  }
265  case pcSkip :
266  field++;
267  break;
268  case pcZygosity :
269  {
270  int new_zygosity;
271 
272  const char * zygosity = tokens[field++];
273 
274  switch (zygosity[0])
275  {
276  case 'D' :
277  case 'd' :
278  new_zygosity = 2;
279  break;
280  case 'M' :
281  case 'm' :
282  new_zygosity = 1;
283  break;
284  default :
285  new_zygosity = atoi(zygosity);
286  }
287  if (p->zygosity != 0 && new_zygosity != p->zygosity)
288  error("Conflict with previous zygosity - "
289  "Column %d in pedigree\n"
290  "Family: %s Individual: %s Old: %d New: %d\n",
291  col, (const char *) p->famid, (const char *) p->pid,
292  p->zygosity, new_zygosity);
293  p->zygosity = new_zygosity;
294  break;
295  }
296  case pcEnd :
297  break;
298  default :
299  error("Inconsistent Pedigree Description -- Internal Error");
300  }
301  }
302 
303  Sort();
304 }
305 
306 void Pedigree::LoadMendel(IFILE & input)
307 {
308  // First, retrieve the two format statements from file
309  String familyHeader;
310  String individualRecord;
311 
312  familyHeader.ReadLine(input);
313  individualRecord.ReadLine(input);
314 
315  // Then create two FORTRAN input streams...
316  // One will be used for retrieving family labels and sizes, the other
317  // will be used for individual information
318  FortranFormat headers, records;
319 
320  headers.SetInputFile(input);
321  headers.SetFormat(familyHeader);
322 
323  records.SetInputFile(input);
324  records.SetFormat(individualRecord);
325 
326  // Storage for key pieces of information
327  String famid;
328  String phenotype;
329  String affectionCode;
330  String affectionStem;
331  int familySize;
332 
333  String allele1, allele2;
334 
335  int sexCovariate = sexAsCovariate ? GetCovariateID("sex") : -1;
336 
337  while (!ifeof(input))
338  {
339  if (count == size)
340  Grow();
341 
342  // Retrieve header for next family
343  familySize = headers.GetNextInteger();
344  headers.GetNextField(famid);
345  headers.Flush();
346 
347  if (famid.IsEmpty())
348  {
349  if (ifeof(input) && familySize == 0)
350  break;
351  else
352  error("Blank family id encountered\n");
353  }
354 
355  // Retrieve each individual in the family
356  for (int i = 0; i < familySize; i++)
357  {
358  Person * p = persons[count++] = new Person;
359 
360  // Retrieve basic pedigree structure
361  p->famid = famid;
362  records.GetNextField(p->pid);
363  records.GetNextField(p->fatid);
364  records.GetNextField(p->motid);
365 
366  if (p->pid.IsEmpty())
367  error("No unique identifier for individual #%d in family %s\n",
368  i + 1, (const char *) famid);
369 
370  if (p->pid.Compare(".") == 0)
371  error("Family %s has an individual named '.', but this code is\n"
372  "reserved to indicate missing parents\n");
373 
374  if (p->fatid.IsEmpty()) p->fatid = ".";
375  if (p->motid.IsEmpty()) p->motid = ".";
376 
377  // Retrieve and decode sex code
378  char sex = records.GetNextCharacter();
379 
380  switch (sex)
381  {
382  case '0' :
383  case 'x' :
384  case 'X' :
385  case '?' :
386  case 0 :
387  p->sex = 0;
388  break;
389  case '1' :
390  case 'm' :
391  case 'M' :
392  p->sex = 1;
393  break;
394  case '2' :
395  case 'f' :
396  case 'F' :
397  p->sex = 2;
398  break;
399  default :
400  error("Can't interpret the sex of individual #%d\n"
401  "Family: %s Individual: %s Sex Code: %s", count,
402  (const char *) p->famid, (const char *) p->pid, sex);
403  };
404 
405  if (sexAsCovariate)
406  {
407  if (p->sex)
408  p->covariates[sexCovariate] = p->sex;
409  else
410  p->covariates[sexCovariate] = _NAN_;
411  }
412 
413  // Retrieve and decode zygosity
414  char zygosity = records.GetNextCharacter();
415 
416  // Mendel uses a unique character to indicate each MZ pair,
417  // we use a unique odd number...
418  if (zygosity)
419  p->zygosity = (zygosity - ' ') * 2 - 1;
420 
421  affectionStem.Clear();
422  for (int col = 0; col < pd.columnCount; col++)
423  switch (pd.columns[col])
424  {
425  case pcAffection :
426  {
427  int a = pd.columnHash[col];
428 
429  // We expand each Mendel non-codominant trait into multiple
430  // affection status column... First, if this is not a
431  // continuation of a previous expansion we first retrieve
432  // and encode the affection status.
433  if (affectionStem.Length() == 0 ||
434  affectionNames[a].CompareToStem(affectionStem) != 0)
435  {
436  affectionStem.Copy(affectionNames[a], 0, affectionNames[a].FindChar('>') + 1);
437  records.GetNextField(phenotype);
438  affectionCode = affectionStem + phenotype;
439  }
440 
441  // Then encode each phenotype appropriately
442  if (phenotype.IsEmpty())
443  p->affections[a] = 0;
444  else
445  p->affections[a] = affectionCode.Compare(affectionNames[a]) == 0 ? 2 : 1;
446 
447  break;
448  }
449  case pcMarker :
450  {
451  int m = pd.columnHash[col];
452 
453  records.GetNextField(phenotype);
454 
455  if (phenotype.IsEmpty())
456  {
457  p->markers[m].one = p->markers[m].two = 0;
458  continue;
459  }
460 
461  int separator = phenotype.FindChar('/');
462  if (separator == -1) separator = phenotype.FindChar('|');
463 
464  if (separator == -1)
465  error("At marker %s, person %s in family %s has genotype %s.\n"
466  "This genotype is not in the 'al1/al2' format.\n",
467  (const char *) markerNames[m],
468  (const char *) p->pid,
469  (const char *) p->famid,
470  (const char *) phenotype);
471 
472  allele1.Copy(phenotype, 0, separator);
473  allele1.Trim();
474 
475  allele2.Copy(phenotype, separator + 1, 8);
476  allele2.Trim();
477 
478  MarkerInfo * info = GetMarkerInfo(m);
479 
480  int one = info->alleleNumbers.Integer(allele1);
481 
482  if (one < 0)
483  {
484  if (info->freq.Length() == 0)
485  one = info->NewAllele(allele1);
486  else
487  error("At marker %s, person %s in family %s has genotype %s.\n"
488  "However, '%s' is not a valid allele for this marker.\n",
489  (const char *) markerNames[m],
490  (const char *) p->pid,
491  (const char *) p->famid,
492  (const char *) phenotype,
493  (const char *) allele1);
494  }
495 
496  int two = info->alleleNumbers.Integer(allele2);
497 
498  if (two < 0)
499  {
500  if (info->freq.Length() == 0)
501  two = info->NewAllele(allele2);
502  else
503  error("At marker %s, person %s in family %s has genotype %s.\n"
504  "However, '%s' is not a valid allele for this marker.\n",
505  (const char *) markerNames[m],
506  (const char *) p->pid,
507  (const char *) p->famid,
508  (const char *) phenotype,
509  (const char *) allele2);
510  }
511 
512  p->markers[m].one = one;
513  p->markers[m].two = two;
514  break;
515  }
516  case pcEnd :
517  break;
518  case pcTrait :
519  case pcCovariate :
520  case pcSkip :
521  case pcZygosity :
522  default:
523  error("Inconsistent Pedigree Description -- Internal Error");
524  }
525 
526  records.Flush();
527  }
528  }
529 
530  Sort();
531 }
532 
533 void Pedigree::Prepare(const char * filename)
534 {
535  // Clear any previously loaded pedigree description
536  if (multiPd != NULL)
537  delete [] multiPd;
538 
539  multiFileCount = 1;
540 
541  // Enable multifile support
542  StringArray filenames;
543 
544  filenames.AddColumns(filename, ',');
545 
546  if (filenames.Length() <= 1)
547  pd.Load(filename);
548  else
549  {
550  printf("AUTOMATIC MERGE ENABLED: Detected multiple datafile names, separated by commas...\n");
551 
552  multiPd = new PedigreeDescription[filenames.Length()];
553 
554  for (int i = 0; i < filenames.Length(); i++)
555  {
556  printf(" AUTOMATIC MERGE: Reading data file '%s' ...\n", (const char *) filenames[i]);
557  multiPd[i].Load(filenames[i], false);
558  }
559 
560  multiFileCount = filenames.Length();
561  }
562 }
563 
564 void Pedigree::Load(const char * filename, bool allowFailures)
565 {
566  if (multiFileCount <= 1)
567  {
568  IFILE f = ifopen(filename, "rb");
569 
570  if (f == NULL && allowFailures)
571  return;
572 
573  if (f == NULL)
574  error(
575  "The pedigree file %s cannot be opened\n\n"
576  "Common causes for this problem are:\n"
577  " * You might not have used the correct options to specify input file names,\n"
578  " please check the program documentation for information on how to do this\n\n"
579  " * The file doesn't exist or the filename might have been misspelt\n\n"
580  " * The file exists but it is being used by another program which you will need\n"
581  " to close\n\n"
582  " * The file is larger than 2GB and you haven't compiled this application with\n"
583  " large file support.\n\n",
584  filename);
585 
586  Load(f);
587  ifclose(f);
588  }
589  else
590  {
591  StringArray filenames;
592 
593  filenames.AddColumns(filename, ',');
594 
595  if (filenames.Length() != multiFileCount)
596  error("Different numbers of comma separated data and pedigree file names provided\n");
597 
598  for (int i = 0; i < filenames.Length(); i++)
599  {
600  printf(" AUTOMATIC MERGE: Datafile '%s' matched to pedigree '%s' ...\n",
601  (const char *) multiPd[i].filename, (const char *) filenames[i]);
602 
603  pd = multiPd[i];
604 
605  IFILE f = ifopen(filenames[i], "rb");
606 
607  if (f == NULL)
608  error("The pedigree file '%s' cannot be opened\n\n",
609  (const char *) filenames[i]);
610 
611  Load(f);
612  ifclose(f);
613  }
614 
615  printf("\n");
616  }
617 }
618 
619 int Pedigree::TranslateSexCode(const char * code, bool & failure)
620 {
621  failure = false;
622 
623  switch (code[0])
624  {
625  case 'x' :
626  case 'X' :
627  case '?' :
628  return 0;
629  case '1' :
630  case 'm' :
631  case 'M' :
632  return 1;
633  case '2' :
634  case 'f' :
635  case 'F' :
636  return 2;
637  default :
638  {
639  int result = atoi(code);
640 
641  if (result != 0 && result != 1 && result != 2)
642  {
643  failure = true;
644  result = 0;
645  }
646 
647  return result;
648  }
649  };
650 }
Alleles
Definition: PedigreeAlleles.h:24
String
Definition: StringBasics.h:39
ifopen
IFILE ifopen(const char *filename, const char *mode, InputFile::ifileCompression compressionMode=InputFile::DEFAULT)
Open a file with the specified name and mode, using a filename of "-" to indicate stdin/stdout.
Definition: InputFile.h:562
ifeof
int ifeof(IFILE file)
Check to see if we have reached the EOF (returns 0 if not EOF).
Definition: InputFile.h:654
FortranFormat
Definition: FortranFormat.h:25
MarkerInfo
Definition: PedigreeGlobals.h:30
InputFile
Class for easily reading/writing files without having to worry about file type (uncompressed,...
Definition: InputFile.h:37
PedigreeDescription
Definition: PedigreeDescription.h:42
ifclose
int ifclose(IFILE &file)
Close the file.
Definition: InputFile.h:580
Person
Definition: PedigreePerson.h:32
StringArray
Definition: StringArray.h:24