1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
|
\documentclass[11pt]{article}
\usepackage{amsmath,latexsym,natbib,fullpage,color,subfigure,rotating,
url,setspace,multirow,amssymb,hyperref}
\onehalfspacing
\providecommand{\url}[1]{\texttt{#1}}
\title{GEMMA User Manual}
\author{Xiang Zhou}
\date{\today}
\newcommand{\me}{\mathrm{e}}
\newcommand{\supp}{\operatorname{supp}}
\newcommand{\abs}[1]{\left|#1\right|}
\newcommand{\comment}[1]{{\em #1}}
\newcommand{\ba}{\mathbf{a}}
\newcommand{\bb}{\mathbf{b}}
\newcommand{\be}{\mathbf{e}}
\newcommand{\bg}{\mathbf{g}}
\newcommand{\bq}{\mathbf{q}}
\newcommand{\bk}{\mathbf{k}}
\newcommand{\bv}{\mathbf{v}}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\by}{\mathbf{y}}
\newcommand{\bz}{\mathbf{z}}
\newcommand{\bu}{\mathbf{u}}
\newcommand{\bw}{\mathbf{w}}
\newcommand{\bm}{\mathbf{m}}
\newcommand{\w}{\mathbf{w}}
\newcommand{\bK}{\mathbf{K}}
\newcommand{\bV}{\mathbf{V}}
\newcommand{\bA}{\mathbf{A}}
\newcommand{\bB}{\mathbf{B}}
\newcommand{\bX}{\mathbf{X}}
\newcommand{\bY}{\mathbf{Y}}
\newcommand{\bE}{\mathbf{E}}
\newcommand{\bG}{\mathbf{G}}
\newcommand{\bH}{\mathbf{H}}
\newcommand{\bQ}{\mathbf{Q}}
\newcommand{\bP}{\mathbf{P}}
\newcommand{\bW}{\mathbf{W}}
\newcommand{\bM}{\mathbf{M}}
\newcommand{\bU}{\mathbf{U}}
\newcommand{\bZ}{\mathbf{Z}}
\newcommand{\bD}{\mathbf{D}}
\newcommand{\bI}{\mathbf{I}}
\newcommand{\bepsilon}{\boldsymbol\epsilon}
\newcommand{\bbeta}{\boldsymbol\beta}
\newcommand{\tbbeta}{{\tilde{\boldsymbol\beta}}}
\newcommand{\tbeta}{{\tilde{\beta}}}
\newcommand{\bgamma}{\boldsymbol\gamma}
\newcommand{\bdelta}{\boldsymbol\delta}
\newcommand{\btheta}{\boldsymbol\theta}
\newcommand{\bpsi}{\boldsymbol\psi}
\newcommand{\bphi}{\boldsymbol\phi}
\newcommand{\balpha}{\boldsymbol\alpha}
\newcommand{\bOmega}{\boldsymbol\Omega}
\newcommand{\bSigma}{\boldsymbol\Sigma}
\begin{document}
\maketitle
\tableofcontents
\newpage
\section{Introduction}
\subsection{What is GEMMA}
GEMMA is the software implementing the Genome-wide Efficient Mixed
Model Association algorithm \cite{Zhou:2012} for a standard linear
mixed model and some of its close relatives for genome-wide
association studies (GWAS). It fits a univariate linear mixed model
(LMM) for marker association tests with a single phenotype to account
for population stratification and sample structure, and for estimating
the proportion of variance in phenotypes explained (PVE) by typed
genotypes (i.e. ``chip heritability'' or ``SNP heritability'') \cite{Zhou:2012}. It fits a
multivariate linear mixed model (mvLMM) for testing marker
associations with multiple phenotypes simultaneously while controlling
for population stratification, and for estimating genetic correlations
among complex phenotypes \cite{Zhou:2014}. It fits a Bayesian sparse
linear mixed model (BSLMM) using Markov chain Monte Carlo (MCMC) for
estimating PVE by typed genotypes, predicting phenotypes, and
identifying associated markers by jointly modeling all markers while
controlling for population structure \cite{Zhou:2013}. It fits HE,
REML and MQS for variance component estimation using either
individual-level data or summary statistics \cite{Zhou:2016}. It is
computationally efficient for large scale GWAS and uses freely
available open-source numerical libraries.
\subsection{How to Cite GEMMA}
\begin{itemize}
\item Software tool and univariate linear mixed models \\ Xiang Zhou
and Matthew Stephens (2012). Genome-wide efficient mixed-model
analysis for association studies. Nature Genetics. 44: 821-824.
\item Multivariate linear mixed models \\ Xiang Zhou and Matthew
Stephens (2014). Efficient multivariate linear mixed model
algorithms for genome-wide association studies. Nature Methods. 11:
407-409.
\item Bayesian sparse linear mixed models \\ Xiang Zhou, Peter
Carbonetto and Matthew Stephens (2013). Polygenic modeling with
Bayesian sparse linear mixed models. PLoS Genetics. 9(2): e1003264.
\item Variance component estimation with individual-level or summary
data\\ Xiang Zhou (2016). A unified framework for variance component
estimation with summary statistics in genome-wide association
studies. bioRxiv. 042846.
\end{itemize}
\subsection{Models}
\subsubsection{Univariate Linear Mixed Model}
GEMMA can fit a univariate linear mixed model in the following form:
%
\begin{equation*}
\by=\bW \balpha+\bx\beta+\bu+\bepsilon; \quad \bu\sim \mbox{MVN}_n(0, \lambda \tau^{-1} \bK), \quad \bepsilon \sim \mbox{MVN}_n(0, \tau^{-1} \bI_n),
\end{equation*}
%
where $\by$ is an $n$-vector of quantitative traits (or binary disease
labels) for $n$ individuals; $\bW=(\bw_1, \cdots, \bw_c)$ is an
$n\times c$ matrix of covariates (fixed effects) including a column of
1s; $\boldsymbol \alpha$ is a $c$-vector of the corresponding
coefficients including the intercept; $\bx$ is an $n$-vector of marker
genotypes; $\beta$ is the effect size of the marker and is an estimate
of the marker/SNP additive effect; $\bu$ is an
$n$-vector of random effects; $\bepsilon$ is an $n$-vector of errors;
$\tau^{-1}$ is the variance of the residual errors; $\lambda$ is the
ratio between the two variance components; $\bK$ is a known $n\times
n$ relatedness matrix and $\bI_n$ is an $n\times n$ identity
matrix. $\mbox{MVN}_n$ denotes the $n$-dimensional multivariate normal
distribution.
GEMMA tests the alternative hypothesis $H_1: \beta\neq 0$ against the
null hypothesis $H_0: \beta=0$ for each SNP in turn, using one of the
three commonly used test statistics (Wald, likelihood ratio or
score). GEMMA obtains either the maximum likelihood estimate (MLE) or
the restricted maximum likelihood estimate (REML) of $\lambda$ and
$\beta$, and outputs the corresponding $p$ value.
In addition, GEMMA estimates the PVE by typed genotypes or ``chip or
SNP heritability''.
\subsubsection{Multivariate Linear Mixed Model}
GEMMA can fit a multivariate linear mixed model in the following form:
%
\begin{equation*}
\bY = \bW \bA+\bx\bbeta^T+\bU+\bE; \quad
\bG \sim \mbox{MN}_{n \times d}({\bf 0}, \bK, \bV_g),
\quad \bE \sim \mbox{MN}_{n \times d}({\bf 0}, \bI_{n\times n}, \bV_e),
\end{equation*}
%
where $\bY$ is an $n$ by $d$ matrix of $d$ phenotypes for $n$
individuals; $\bW=(\bw_1, \cdots, \bw_c)$ is an $n\times c$ matrix of
covariates (fixed effects) including a column of 1s; $\bA$ is a $c$ by
$d$ matrix of the corresponding coefficients including the intercept;
$\bx$ is an $n$-vector of marker genotypes; $\bbeta$ is a $d$ vector
of marker effect sizes for the $d$ phenotypes; $\bU$ is an $n$ by $d$
matrix of random effects; $\bE$ is an $n$ by $d$ matrix of errors;
$\bK$ is a known $n$ by $n$ relatedness matrix, $\bI_{n\times n}$ is a
$n$ by $n$ identity matrix, $\bV_g$ is a $d$ by $d$ symmetric matrix
of genetic variance component, $\bV_e$ is a $d$ by $d$ symmetric
matrix of environmental variance component and $\mbox{MN}_{n \times
d}({\bf 0}, \bV_1, \bV_2)$ denotes the $n \times d$ matrix normal
distribution with mean 0, row covariance matrix $\bV_1$ ($n$ by $n$),
and column covariance matrix $\bV_2$ ($d$ by $d$).
GEMMA performs tests comparing the null hypothesis that the marker
effect sizes for all phenotypes are zero, $H_0: \bbeta= {\bf 0}$,
where ${\bf 0}$ is a $d$-vector of zeros, against the general
alternative $H_1: \bbeta\neq {\bf 0}$. For each SNP in turn, GEMMA
obtains either the maximum likelihood estimate (MLE) or the restricted
maximum likelihood estimate (REML) of $\bV_g$ and $\bV_e$, and outputs
the corresponding $p$ value.
In addition, GEMMA estimates the genetic and environmental
correlations among phenotypes.
\subsubsection{Bayesian Sparse Linear Mixed Model}
GEMMA can fit a Bayesian sparse linear mixed model in the following
form as well as a corresponding probit counterpart:
%
\begin{equation*}
\mathbf y=\mathbf 1_n\mu+\bX\bbeta+\bu+\bepsilon;
\quad \beta_i \sim \pi \mbox{N}(0, \sigma_a^2\tau^{-1})+(1-\pi)\delta_0,
\quad \bu\sim \mbox{MVN}_n(0, \sigma_b^2 \tau^{-1} \bK), \quad
\bepsilon \sim \mbox{MVN}_n(0, \tau^{-1} \bI_n),
\end{equation*}
%
where $\mathbf 1_n$ is an $n$-vector of 1s, $\mu$ is a scalar
representing the phenotype mean, $\mathbf X$ is an $n \times p$ matrix
of genotypes measured on $n$ individuals at $p$ genetic markers,
$\bbeta$ is the corresponding $p$-vector of the genetic marker
effects, and other parameters are the same as defined in the standard
linear mixed model in the previous section.
In the special case $\bK=\bX\bX^T/p$ (default in GEMMA), the SNP
effect sizes can be decomposed into two parts: $\balpha$ that captures
the small effects that all SNPs have, and $\bbeta$ that captures the
additional effects of some large effect SNPs. In this case,
$\bu=\bX\balpha$ can be viewed as the combined effect of all small
effects, and the total effect size for a given SNP is
$\alpha_i+\beta_i$.
There are two important hyper-parameters in the model: PVE, being the
proportion of variance in phenotypes explained by the sparse effects
($\bX\bbeta$) and random effects terms ($\bu$) together, and PGE,
being the proportion of genetic variance explained by the sparse
effects terms ($\mathbf X\boldsymbol\beta$). These two parameters are
defined as follows:
\begin{align*}
\mbox{PVE}(\bbeta, \bu, \tau) &
:=\frac{\mbox{V}(\bX\bbeta+\bu)}{\mbox{V}(\bX\bbeta+\bu)+\tau^{-1}}, \\
\mbox{PGE}(\bbeta, \bu)
& :=\frac{\mbox{V}(\bX\bbeta)}{\mbox{V}(\bX\bbeta+\bu)}, \\
\intertext{where}
\mbox{V}({\bf x}) &:=\frac{1}{n}\sum_{i=1}^n (x_i-\overline {x})^2.
\end{align*}
GEMMA uses MCMC to estimate $\boldsymbol\beta$, $\mathbf u$ and all
other hyper-parameters including PVE, PGE and $\pi$.
\subsubsection{Variance Component Models}
GEMMA can be used to estimate variance components from a
multiple-component linear mixed model in the following form:
%
\begin{equation*}
\by=\sum_{i=1}^k \bX_i\bbeta_i+\bepsilon;
\quad \beta_{il} \sim \mbox{N}(0, \sigma_i^2/p_i),
\quad \bepsilon \sim \mbox{MVN}_n(0, \sigma_e^2 \bI_n),
\end{equation*}
%
which is equivalent to
%
\begin{equation*}
\by=\sum_{i=1}^k \bu_i +\bepsilon;
\quad \bu_i \sim \mbox{MVN}_n(0, \sigma_i^2 \bK_i),
\quad \bepsilon \sim \mbox{MVN}_n(0, \sigma_e^2 \bI_n),
\end{equation*}
%
where genetic markers are classified into $k$ non-overlapping
categories; $\bX_i$ is an $n \times p_i$ matrix of genotypes measured
on $n$ individuals at $p_i$ genetic markers in $i$'th category;
$\bbeta_i$ is the corresponding $p_i$-vector of the genetic marker
effects, where each element follows a normal distribution with
variance $\sigma_i^2/p_i$; $u_i$ is the combined genetic effects from
$i$'th category; $\bK_i=\bX_i\bX_i^T/p_i$ is the category specific
genetic relatedness matrix; and other parameters are the same as
defined in the standard linear mixed model in the previous section.
GEMMA estimates the variance components $\sigma_i^2$. When
individual-level data are available, GEMMA uses the HE regression
method or the REML average information (AI) algorithm for
estimation. When summary-level data are available, GEMMA uses MQS
(MINQUE for Summary Statistics) for estimation.
\subsection{Missing Data}
\subsubsection{Missing Genotypes}
As mentioned before \cite{Zhou:2012}, the tricks used in the GEMMA
algorithm rely on having complete or imputed genotype data at each
SNP. That is, GEMMA requires the user to impute all missing genotypes
before association testing. This imputation step is arguably
preferable than simply dropping individuals with missing genotypes,
since it can improve power to detect associations \cite{Guan:2008}.
Therefore, for fitting both LMM or BSLMM, missing genotypes are
recommended to be imputed first. Otherwise, any SNPs with missingness
above a certain threshold (default $5\%$) will not be analyzed, and
missing genotypes for SNPs that do not pass this threshold will be
simply replaced with the estimated mean genotype of that SNP. For
predictions, though, all SNPs will be used regardless of their
missingness. Missing genotypes in the test set will be replaced by the
mean genotype in the training set.
\subsubsection{Missing Phenotypes}
Individuals with missing phenotypes will not be included in the LMM or
BSLMM analysis. However, all individuals will be used for calculating
the relatedness matrix, so that the resulting relatedness matrix is
still an $n\times n$ matrix regardless of how many individuals have
missing phenotypes. In addition, predicted values will be calculated
for individuals with missing values, based on individuals with
non-missing values.
For relatedness matrix calculation, because missingness and minor
allele frequency for a given SNP are calculated based on analyzed
individuals (i.e. individuals with no missing phenotypes and no
missing covariates), if all individuals have missing phenotypes, then
no SNP and no individuals will be included in the analysis and the
estimated relatedness matrix will be full of ``nan"s.
\newpage
\section{Installing and Compiling GEMMA}
If you have downloaded a binary executable, no installation is
necessary. In some cases, you may need to use ``chmod a+x gemma"
before using the binary executable. In addition, notice that the
end-of-line coding in Windows (DOS) is different from that in Linux,
and so you may have to convert input files using the utility {\it
dos2unix} or {\it unix2dos} in order to use them in a different
platform.
The binary executable of GEMMA works well for a reasonably large
number of individuals (say, for example, the ``-eigen " option works
for at least 45,000 individuals).
If you want to compile GEMMA by yourself, you will need to download
the source code, and you will need a standard C/C++ compiler such as
GNU gcc, as well as GSL and OpenBLAS libraries. A sample
Makefile is provided along with the source code.
\newpage
\section{Input File Formats}
GEMMA requires four main input files containing genotypes, phenotypes,
relatedness matrix and (optionally) covariates. Genotype and phenotype
files can be in two formats, either both in the PLINK binary ped
format or both in the BIMBAM format. Mixing genotype and phenotype
files from the two formats (for example, using PLINK files for
genotypes and using BIMBAM files for phenotypes) will result in
unwanted errors. BIMBAM format is particularly useful for imputed
genotypes, as PLINK codes genotypes using 0/1/2, while BIMBAM can
accommodate any real values between 0 and 2 (and any real values if
paired with ``-notsnp'' option). In addition, to estimate variance
components using summary statistics, GEMMA requires two other input
files: one contains marginal z-scores and the other contains SNP
category.
Notice that the BIMBAM mean genotype file, the relatedness matrix
file, the marginal z-score file and the category file can be provided
in compressed gzip format, while other files should be provided in
uncompressed format.
\subsection{PLINK Binary PED File Format}
GEMMA recognizes the PLINK binary ped file format
(\url{http://pngu.mgh.harvard.edu/~purcell/plink/})
\cite{Purcell:2007} for both genotypes and phenotypes. This format
requires three files: *.bed, *.bim and *.fam, all with the same
prefix. The *.bed file should be in the default SNP-major mode
(beginning with three bytes). One can use the PLINK software to
generate binary ped files from standard ped files using the following
command:
%
\begin{verbatim}
plink --file [file_prefix] --make-bed --out [bedfile_prefix]
\end{verbatim}
%
For the *.fam file, GEMMA only reads the second column (individual id)
and the sixth column (phenotype). One can specify a different column
as the phenotype column by using ``-n [num]", where "-n 1" uses the
original sixth column as phenotypes, and ``-n 2" uses the seventh
column, and so on and so forth.
GEMMA codes alleles as 0/1 according to the plink webpage on binary
plink format
(\url{http://pngu.mgh.harvard.edu/~purcell/plink/binary.shtml}). Specifically,
the column 5 of the *.bim file is the minor allele and is coded as 1,
while the column 6 of the *.bim file is the major allele and is coded
as 0. The minor allele in column 5 is therefore the effect allele
(notice that GEMMA version 0.92 and before treats the major allele as
the effect allele).
GEMMA will read the phenotypes as provided and will recognize either
``-9" or ``NA" as missing phenotypes. If the phenotypes in the *.fam
file are disease status, one is recommended to label controls as 0 and
cases as 1, as the results will have better interpretation. For
example, the predicted values from a linear BSLMM can be directly
interpreted as the probability of being a case. In addition, the
probit BSLMM will only recognize 0/1 as control/case labels.
For prediction problems, one is recommended to list all individuals in
the file, but label those individuals in the test set as missing. This
will facilitate the use of the prediction function implemented in
GEMMA.
\subsection{BIMBAM File Format}
GEMMA also recognizes BIMBAM file format
(\url{http://stephenslab.uchicago.edu/software.html})
\cite{Guan:2008}, which is particularly useful for imputed genotypes
as well as for general covariates other than SNPs. BIMBAM format
consists of three files, a mean genotype file, a phenotype file, and
an optional SNP annotation file. We explain these files in detail
below.
\subsubsection{Mean Genotype File}
This file contains genotype information. The first column is SNP id,
the second and third columns are allele types with minor allele first,
and the remaining columns are the posterior/imputed mean genotypes of
different individuals numbered between 0 and 2. An example mean
genotype file with two SNPs and three individuals is as follows:
%
\begin{verbatim}
rs1, A, T, 0.02, 0.80, 1.50
rs2, G, C, 0.98, 0.04, 1.00
\end{verbatim}
%
GEMMA codes alleles exactly as provided in the BIMBAM mean genotype file, and
ignores the allele types in the second and third columns. Therefore,
the minor allele is the effect allele only if one codes minor allele
as 2 and major allele as 0.
Missing genotypes are represented as ``NA'' values.
One can use the following bash command (in one line) to generate
BIMBAM mean genotype file from IMPUTE genotype files \\
(\url{http://www.stats.ox.ac.uk/~marchini/software/gwas/file_format.html})\cite{Howie:2009}:
%
\begin{verbatim}
cat [impute filename] | awk -v s=[number of samples/individuals]
'{ printf $2 "," $4 "," $5; for(i=1; i<=s; i++) \
printf "," $(i*3+3)*2+$(i*3+4); printf "\n" }'
> [bimbam filename]
\end{verbatim}
%
Notice that one may need to manually input the two quote symbols '
. Depending on the terminal, a direct copy/paste of the above line may
result in ``-bash: syntax error near unexpected token `(' " errors.
Finally, the mean genotype file can accommodate values other than SNP
genotypes. One can use the ``-notsnp" option to disable the minor
allele frequency cutoff and to use any numerical values as covariates.
\subsubsection{Phenotype File}
This file contains phenotype information. Each line is a number
indicating the phenotype value for each individual in turn, in the
same order as in the mean genotype file. Notice that only numeric
values are allowed and characters will not be recognized by the
software. Missing phenotype information is denoted as NA. The number
of rows should be equal to the number of individuals in the mean
genotype file. An example phenotype file with five individuals and one
phenotype is as follows:
%
\begin{verbatim}
1.2
NA
2.7
-0.2
3.3
\end{verbatim}
%
One can include multiple phenotypes as multiple columns in the
phenotype file, and specify a different column for association tests
by using ``-n [num]", where ``-n 1" uses the original first column as
phenotypes, and ``-n 2" uses the second column, and so on and so
forth. An example phenotype file with five individuals and three
phenotypes is as follows:
%
\begin{verbatim}
1.2 -0.3 -1.5
NA 1.5 0.3
2.7 1.1 NA
-0.2 -0.7 0.8
3.3 2.4 2.1
\end{verbatim}
%
For binary traits, one is recommended to label controls as 0 and cases
as 1, as the results will have better interpretation. For example, the
predicted values from a linear BSLMM can be directly interpreted as
the probability of being a case. In addition, the probit BSLMM will
only recognize 0/1 as control/case labels.
For prediction problems, one is recommended to list all individuals in
the file, but label those individuals in the test set as missing. This
will facilitate the use of the prediction function implemented in
GEMMA.
\subsubsection{SNP Annotation File (optional)}
This file contains SNP information. The first column is SNP id, the
second column is its base-pair position, and the third column is its
chromosome number. The rows are not required to be in the same order
of the mean genotype file, but must contain all SNPs in that file. An
example annotation file with four SNPs is as follows:
%
\begin{verbatim}
rs1, 1200, 1
rs2, 1000, 1
rs3, 3320, 1
rs4, 5430, 1
\end{verbatim}
%
If an annotation file is not provided, the SNP information columns in
the output file for association tests will have ``-9" as missing
values.
\subsection{Relatedness Matrix File Format}
GEMMA, as a linear mixed model software, requires a relatedness matrix
file in addition to both genotype and phenotype files. The relatedness
matrix can be supplied in two different ways: either use the original
relatedness matrix, or use the eigen values and eigen vectors of the
original relatedness matrix.
\subsubsection{Original Matrix Format}
GEMMA takes the original relatedness matrix file in two formats. The
first format is a $n\times n$ matrix, where each row and each column
corresponds to individuals in the same order as in the *.fam file or
in the mean genotype file, and $i$th row and $j$th column is a number
indicating the relatedness value between $i$th and $j$th
individuals. An example relatedness matrix file with three individuals
is as follows:
%
\begin{verbatim}
0.3345 -0.0227 0.0103
-0.0227 0.3032 -0.0253
0.0103 -0.0253 0.3531
\end{verbatim}
%
The second relatedness matrix format is a three column ``id id value"
format, where the first two columns show two individual id numbers,
and the third column shows the relatedness value between these two
individuals. Individual ids are not required to be in the same order
as in the *.fam file, and relatedness values not listed in the
relatedness matrix file will be considered as 0. An example
relatedness matrix file with the same three individuals above is shown
below:
%
\begin{verbatim}
id1 id1 0.3345
id1 id2 -0.0227
id1 id3 0.0103
id2 id2 0.3032
id2 id3 -0.0253
id3 id3 0.3531
\end{verbatim}
%
As BIMBAM mean genotype files do not provide individual id, the second
format only works with the PLINK binary ped format. One can use ``-km
[num]" to choose which format to use, i.e. use ``-km 1" or ``-km 2" to
accompany PLINK binary ped format, and use ``-km 1" to accompany
BIMBAM format.
\subsubsection{Eigen Value and Eigen Vector Format}
GEMMA can also read the relatedness matrix in its decomposed forms. To
do this, one should supply two files instead of one: one file
containing the eigen values and the other file containing the
corresponding eigen vectors. The eigen value file contains one column
of $n_{a}$ elements, with each element corresponds to an eigen
value. The eigen vector file contains a $n_a\times n_a$ matrix, with
each column corresponds to an eigen vector. The eigen vector in the
$i$th column of the eigen vector file should correspond to the eigen
value in the $i$th row of the eigen value file. Both files can be
generated from the original relatedness matrix file by using the
``-eigen " option in GEMMA. Notice that $n_a$ represents the number of
analyzed individuals, which may be smaller than the number of total
individuals $n$.
\subsection{Covariates File Format (optional)}
One can provide a covariates file if needed for fitting LMM if
necessary. GEMMA fits a linear mixed model with an intercept term if
no covariates file is provided, but does not internally provide an
intercept term if a covariates file is available. Therefore, if one
has covariates other than the intercept and wants to adjust for those
covariates ($\mathbf W$) simultaneously, one should provide GEMMA with
a covariates file containing an intercept term explicitly. The
covariates file is similar to the above BIMBAM multiple phenotype
file, and must contain a column of $1$'s if one wants to include an
intercept. An example covariates file with five individuals and three
covariates (the first column is the intercept) is as follows:
%
\begin{verbatim}
1 1 -1.5
1 2 0.3
1 2 0.6
1 1 -0.8
1 1 2.0
\end{verbatim}
%
It can happen, especially in a small GWAS data set, that some of the
covariates will be identical to some of the genotypes (up to a scaling
factor). This can cause problems in the optimization algorithm and
evoke GSL errors. To avoid this, one can either regress the phenotypes
on the covariates and use the residuals as new phenotypes, or use only
SNPs that are not identical to any of the covariates for the
analysis. The later can be achieved, for example, by performing a
standard linear regression in the genotype data, but with covariates
as phenotypes.
\subsection{Beta/Z File}
This file contains marginal z-scores from the study. The first row is
a header line. The first column is the SNP id, the second column is
number of total SNPs, the third column is the marginal z-score, the
fourth and fifth columns are the SNP alleles. The SNPs are not
required to be in the same order of the other files. An example
category file with four SNPs is as follows:
%
\begin{verbatim}
SNP N Z INC_ALLELE DEC_ALLELE
rs1 1200 -0.322165 A T
rs2 1000 -0.343634 G T
rs3 3320 -0.338341 A T
rs4 5430 -0.322820 T C
\end{verbatim}
%
This file is flexible. You can use beta and se\_beta columns instead
of marginal z-scores. You can directly use the output *.assoc.txt file
from the a linear model analysis as the input beta/z file.
\subsection{Category File}
This file contains SNP category information. The first row is a header
line. The first column is chromosome number (optional), the second
column is base pair position (optional), the third column is SNP id,
the fourth column is its genetic distance on the chromosome
(optional), and the following columns list non-overlapping
categories. A vector of indicators is provided for each SNP. The SNPs
are not required to be in the same order of the other files. An
example category file with four SNPs is as follows:
%
\begin{verbatim}
CHR BP SNP CM CODING UTR PROMOTER DHS INTRON ELSE
1 1200 rs1 0.428408 1 0 0 0 0 0
1 1000 rs2 0.743268 0 0 0 0 0 1
1 3320 rs3 0.744197 0 0 1 1 0 0
1 5430 rs4 0.766409 0 0 0 0 0 0
\end{verbatim}
%
In the above file, rs1 belongs to a coding region; rs2 belongs does
not belong to any of the first five categories; rs3 belongs to both
promoter and DHS regions but will be treated as an DHS snp in the
analysis; rs4 does not belong to any category and will be ignored in
the analysis. Note that if a SNP is labeled with more than one
category, then it will be treated as the last category label.
This file is also flexible, as long as it contains the SNP id and the
category information.
\subsection{LD Score File}
This file contains the LD scores for all SNPs. The first row is a
header line. The first column is chromosome number (optional), the
second column is SNP id, the third column is base pair position
(optional), the fourth column is the LD score of the SNP. An example
LD score file with four SNPs is as follows:
%
\begin{verbatim}
CHR SNP BP L2
1 rs1 1200 1.004
1 rs2 1000 1.052
1 rs3 3320 0.974
1 rs4 5430 0.986
\end{verbatim}
%
In the above file, the LD score for rs1 is 1.004 and the LD score for
rs4 is 0.986.
This file is also flexible, as long as it contains the SNP id and the
LD score information.
\newpage
\section{Running GEMMA}
\subsection{A Small GWAS Example Dataset}
If you downloaded the GEMMA source code recently, you will find an
``example" folder containing a small GWAS example dataset. This data
set comes from the heterogeneous stock mice data, kindly provided by
Wellcome Trust Centre for Human Genetics on the public domain
\url{http://mus.well.ox.ac.uk/mouse/HS/}, with detailed described in
\cite{Valdar:2006}.
The data set consists of 1904 individuals from 85 families, all
descended from eight inbred progenitor strains. We selected two
phenotypes from this data set: the percentage of CD8+ cells, with
measurements in 1410 individuals; mean corpuscular hemoglobin (MCH),
with measurements in 1580 individuals. A total of 1197 individuals
have both phenotypes. The phenotypes were already corrected for sex,
age, body weight, season and year effects by the original study, and
we further quantile normalized the phenotypes to a standard normal
distribution. In addition, we obtained a total of 12,226 autosomal
SNPs, with missing genotypes replaced by the mean genotype of that SNP
in the family. Genotype and phenotype files are in both BIMBAM and
PLINK binary formats.
For demonstration purpose, for CD8, we randomly divided the 85
families into two sets, where each set contained roughly half of the
individuals (i.e. {\it inter-family} split) as in \cite{Zhou:2013}. We
also created artificial binary phenotypes from the quantitative
phenotypes CD8, by assigning the half individuals with higher
quantitative values to 1 and the other half to 0, as in
\cite{Zhou:2013}. Therefore, the phenotype files contain six columns
of phenotypes. The first column contains the quantitative phenotypes
CD8 for all individuals. The second column contains quantitative
phenotypes CD8 for individuals in the training set. The third column
contains quantitative phenotypes CD8 for individuals in the test
set. The fourth and fifth columns contain binary phenotypes CD8 for
individuals in the training set and test set, respectively. The sixth
column contains the quantitative phenotypes MCH for all individuals.
A demo.txt file inside the same folder shows detailed steps on how to
use GEMMA to estimate the relatedness matrix from genotypes, how to
perform association tests using both the univariate linear mixed model
and the multivariate linear mixed model, how to fit the Bayesian
sparse linear mixed model and how to obtain predicted values using the
output files from fitting BSLMM. The output results from GEMMA for all
the examples are also available inside the ``result" subfolder.
\subsection{SNP filters}
The are a few SNP filters implemented in the software.
\begin{itemize}
\item Polymorphism. Non-polymorphic SNPs will not be included in the
analysis.
\item Missingness. By default, SNPs with missingness above 5\% will
not be included in the analysis. Use ``-miss [num]'' to change. For
example, ``-miss 0.1'' changes the threshold to 10\%. With
``-miss 1.0'' the filter is disabled.
\item Minor allele frequency. By default, SNPs with minor allele
frequency below 1\% will not be included in the analysis. Use ``-maf
[num]" to change. For example, ``-maf 0.05'' changes the threshold
to 5\%. With ``-notsnp'' the filter is disabled.
\item Correlation with any covariate. By default, SNPs with $r^2$
correlation with any of the covariates above 0.9999 will not be
included in the analysis. Use ``-r2 [num]'' to change. For example,
``-r2 0.999999'' changes the threshold to 0.999999. With ``-r2
1.0'' the filter is disabled.
\item Hardy-Weinberg equilibrium. Use ``-hwe [num]'' to specify. For
example, ``-hwe 0.001'' will filter out SNPs with Hardy-Weinberg $p$
values below 0.001. With ``-hwe 0'' or ``--notsnp'' the filter is
disabled.
\item User-defined SNP list. Use ``-snps [filename]'' to specify a
list of SNPs to be included in the analysis.
\end{itemize}
Calculations of the above filtering thresholds are based on analyzed
individuals (i.e. individuals with no missing phenotypes and no
missing covariates). Therefore, if all individuals have missing
phenotypes, no SNP will be analyzed and the output matrix will be full
of ``nan"s.
\subsection{Association Tests with a Linear Model}
\subsubsection{Basic Usage}
The basic usages for linear model association analysis with either the PLINK binary ped format or the BIMBAM format are:
\begin{verbatim}
./gemma -bfile [prefix] -lm [num] -o [prefix]
./gemma -g [filename] -p [filename] -a [filename] -lm [num] -o [prefix]
\end{verbatim}
where the ``-lm [num]" option specifies which frequentist test to use,
i.e. ``-lm 1" performs Wald test, ``-lm 2" performs likelihood ratio
test, ``-lm 3" performs score test, and ``-lm 4" performs all the
three tests; ``-bfile [prefix]" specifies PLINK binary ped file
prefix; ``-g [filename]" specifies BIMBAM mean genotype file name;
``-p [filename]" specifies BIMBAM phenotype file name; ``-a
[filename]" (optional) specifies BIMBAM SNP annotation file name; ``-o
[prefix]" specifies output file prefix.
Notice that different from a linear mixed model, this analysis does
not require a relatedness matrix.
\subsubsection{Detailed Information}
For binary traits, one can label controls as 0 and cases as 1, and
follow our previous approaches to fit the data with a linear mixed
model by treating the binary case control labels as quantitative
traits \cite{Zhou:2012, Zhou:2013}. This approach can be justified
partly by recognizing the linear model as a first order Taylor
approximation to a generalized linear model, and partly by the
robustness of the linear model to model misspecification
\cite{Zhou:2013}.
\subsubsection{Output Files}
There will be two output files, both inside an output folder in the
current directory. The prefix.log.txt file contains some detailed
information about the running parameters and computation time. In
addition, prefix.log.txt contains PVE estimate and its standard error
in the null linear mixed model.
The prefix.assoc.txt contains the results. An example file with a few
SNPs is shown below:
%
\begin{verbatim}
chr rs ps n_mis n_obs allele1 allele0 af beta se p_wald
1 rs3683945 3197400 0 1410 A G 0.443 -1.586575e-01 3.854542e-02 4.076703e-05
1 rs3707673 3407393 0 1410 G A 0.443 -1.563903e-01 3.855200e-02 5.252187e-05
1 rs6269442 3492195 0 1410 A G 0.365 -2.349908e-01 3.905200e-02 2.256622e-09
1 rs6336442 3580634 0 1410 A G 0.443 -1.566721e-01 3.857380e-02 5.141944e-05
1 rs13475700 4098402 0 1410 A C 0.127 2.209575e-01 5.644804e-02 9.497902e-05
\end{verbatim}
%
The 11 columns are: chromosome numbers, snp ids, base pair positions
on the chromosome, number of missing individuals for a given snp,
number of non-missing individuals for a given snp, minor allele, major
allele, allele frequency, beta estimates (additive effect), standard
errors for beta, and $p$ values from the Wald test.
\subsection{Estimate Relatedness Matrix from Genotypes}
\subsubsection{Basic Usage}
The basic usages to calculate an estimated relatedness matrix with
either the PLINK binary ped format or the BIMBAM format are:
%
\begin{verbatim}
./gemma -bfile [prefix] -gk [num] -o [prefix]
./gemma -g [filename] -p [filename] -gk [num] -o [prefix]
\end{verbatim}
%
where the ``-gk [num]" option specifies which relatedness matrix to
estimate, i.e. ``-gk 1" calculates the centered relatedness matrix
while ``-gk 2" calculates the standardized relatedness matrix;
``-bfile [prefix]" specifies PLINK binary ped file prefix; ``-g
[filename]" specifies BIMBAM mean genotype file name; ``-p [filename]"
specifies BIMBAM phenotype file name; ``-o [prefix]" specifies output
file prefix.
Notice that the BIMBAM mean genotype file can be provided in a gzip
compressed format.
\subsubsection{Detailed Information}
GEMMA provides two ways to estimate the relatedness matrix from
genotypes, using either the centered genotypes or the standardized
genotypes. We denote $\mathbf X$ as the $n\times p$ matrix of
genotypes, $\mathbf x_i$ as its $i$th column representing genotypes of
$i$th SNP, $\bar{x_i}$ as the sample mean and $v_{x_i}$ as the sample
variance of $i$th SNP, and $\mathbf 1_n$ as a $n\times 1$ vector of
1's. Then the two relatedness matrices GEMMA can calculate are as
follows:
%
\begin{align*}
G_c&=\frac{1}{p}\sum_{i=1}^p
(\mathbf x_i-\mathbf 1_n \bar{x_i})(\mathbf x_i-\mathbf 1_n \bar{x_i})^T, \\
G_s&=\frac{1}{p}\sum_{i=1}^p
\frac{1}{v_{x_i}}(\mathbf x_i-\mathbf 1_n \bar{x_i})
(\mathbf x_i-\mathbf 1_n \bar{x_i})^T.
\end{align*}
%
Which of the two relatedness matrix to choose will largely depend on
the underlying genetic architecture of the given trait. Specifically,
if SNPs with lower minor allele frequency tend to have larger effects
(which is inversely proportional to its genotype variance), then the
standardized genotype matrix is preferred. If the SNP effect size does
not depend on its minor allele frequency, then the centered genotype
matrix is preferred. In our previous experience based on a limited
examples, we typically find the centered genotype matrix provides
better control for population structure in lower organisms, and the
two matrices seem to perform similarly in humans.
\subsubsection{Output Files}
There will be two output files, both inside an output folder in the
current directory. The prefix.log.txt file contains some detailed
information about the running parameters and computation time, while
the prefix.cXX.txt or prefix.sXX.txt contains a $n\times n$ matrix of
estimated relatedness matrix.
\subsection{Perform Eigen-Decomposition of the Relatedness Matrix}
\subsubsection{Basic Usage}
The basic usages to perform an eigen-decomposition of the relatedness
matrix with either the PLINK binary ped format or the BIMBAM format
are:
%
\begin{verbatim}
./gemma -bfile [prefix] -k [filename] -eigen -o [prefix]
./gemma -g [filename] -p [filename] -k [filename] -eigen -o [prefix]
\end{verbatim}
%
where the ``-bfile [prefix]" specifies PLINK binary ped file prefix;
``-g [filename]" specifies BIMBAM mean genotype file name; ``-p
[filename]" specifies BIMBAM phenotype file name; ``-k [filename]"
specifies the relatedness matrix file name; ``-o [prefix]" specifies
output file prefix.
Notice that the BIMBAM mean genotype file and/or the relatedness
matrix file can be provided in a gzip compressed format.
\subsubsection{Detailed Information}
GEMMA extracts the matrix elements corresponding to the analyzed
individuals (which may be smaller than the number of total
individuals), center the matrix, and then perform an
eigen-decomposition.
\subsubsection{Output Files}
There will be three output files, all inside an output folder in the
current directory. The prefix.log.txt file contains some detailed
information about the running parameters and computation time, while
the prefix.eigenD.txt and prefix.eigenU.txt contain the eigen values
and eigen vectors of the estimated relatedness matrix, respectively.
\subsection{Association Tests with Univariate Linear Mixed Models}
\subsubsection{Basic Usage}
The basic usages for association analysis with either the PLINK binary
ped format or the BIMBAM format are:
\begin{verbatim}
./gemma -bfile [prefix] -k [filename] -lmm [num] -o [prefix]
./gemma -g [filename] -p [filename] -a [filename] -k [filename] -lmm [num] -o [prefix]
\end{verbatim}
where the ``-lmm [num]" option specifies which frequentist test to
use, i.e. ``-lmm 1" performs Wald test, ``-lmm 2" performs likelihood
ratio test, ``-lmm 3" performs score test, and ``-lmm 4" performs all
the three tests; ``-bfile [prefix]" specifies PLINK binary ped file
prefix; ``-g [filename]" specifies BIMBAM mean genotype file name;
``-p [filename]" specifies BIMBAM phenotype file name; ``-a
[filename]" (optional) specifies BIMBAM SNP annotation file name; ``-k
[filename]" specifies relatedness matrix file name; ``-o [prefix]"
specifies output file prefix.
To detect gene environmental interactions, you can add "-gxe
[filename]". This gxe file contains a column of environmental
variables. In this case, for each SNP in turn, GEMMA will fit a linear
mixed model that controls both the SNP main effect and environmental
main effect, while testing for the interaction effect.
Notice that ``-k [filename]" could be replaced by ``-d [filename]" and
``-u [filename]", where ``-d [filename]" specifies the eigen value
file and ``-u [filename]" specifies the eigen vector file. The BIMBAM
mean genotype file and/or the relatedness matrix file (or the eigen
vector file) can be provided in a gzip compressed format.
\subsubsection{Detailed Information}
The algorithm calculates either REML or MLE estimate of $\lambda$ in
the evaluation interval from $1\times 10^{-5}$ (corresponding to
almost pure environmental effect) to $1\times 10^5$ (corresponding to
almost pure genetic effect). Although unnecessary in most cases, one
can expand or reduce this evaluation interval by specifying ``-lmin"
and ``-lmax" (e.g. ``-lmin 0.01 -lmax 100" specifies an interval from
$0.01$ to $100$). The log-scale evaluation interval is further divided
into 10 equally spaced regions, and optimization is carried out in
each region where the first derivatives change sign. Although also
unnecessary in most cases, one can increase or decrease the number of
regions by using ``-region" (e.g. ``-region 100" uses 100 equally
spaced regions on the log-scale), which may yield more stable or
faster performance, respectively.
For binary traits, one can label controls as 0 and cases as 1, and
follow our previous approaches to fit the data with a linear mixed
model by treating the binary case control labels as quantitative
traits \cite{Zhou:2012, Zhou:2013}. This approach can be justified
partly by recognizing the linear model as a first order Taylor
approximation to a generalized linear model, and partly by the
robustness of the linear model to model misspecification
\cite{Zhou:2013}.
\subsubsection{Output Files}
There will be two output files, both inside an output folder in the
current directory. The prefix.log.txt file contains some detailed
information about the running parameters and computation time. In
addition, prefix.log.txt contains PVE estimate and its standard error
in the null linear mixed model. Here is an example log file in which
an additional covariates file is also provided:
%
\begin{verbatim}
## Summary Statistics:
## number of total individuals = 1940
## number of analyzed individuals = 1197
## number of covariates = 2
## number of phenotypes = 1
## number of total SNPs = 12226
## number of analyzed SNPs = 10758
## REMLE log-likelihood in the null model = -1355.27
## MLE log-likelihood in the null model = -1356.45
## pve estimate in the null model = 0.598772
## se(pve) in the null model = 0.0356942
## vg estimate in the null model = 1.42894
## ve estimate in the null model = 0.344439
## beta estimate in the null model = 0.00735198 0.0425024
## se(beta) = 0.0169633 0.025582
\end{verbatim}
%
In this example, the ``null model'' in which none of the 10,758
analyzed SNPs have an effect on phenotype, explains about 60\% of the
variance in the phenotype residuals (with standard error of 3.6\%)
after removing linear effects of the two covariates. The genetic and
environmental variance components of the residuals are 1.43 and 0.34,
respectively. The last two values are the regression coefficients for
the covariates in the fitted linear mixed model, with standard
errors. The first number (0.007) is the estimate of the intercept
because the first column in the covariates file is a column of ones.
The prefix.assoc.txt contains the results. An example file with a few
SNPs is shown below:
%
\begin{verbatim}
chr rs ps n_miss allele1 allele0 af beta se l_remle p_wald
1 rs3683945 3197400 0 A G 0.443 -7.788665e-02 6.193502e-02 4.317993e+00 2.087616e-01
1 rs3707673 3407393 0 G A 0.443 -6.654282e-02 6.210234e-02 4.316144e+00 2.841271e-01
1 rs6269442 3492195 0 A G 0.365 -5.344241e-02 5.377464e-02 4.323611e+00 3.204804e-01
1 rs6336442 3580634 0 A G 0.443 -6.770154e-02 6.209267e-02 4.315713e+00 2.757541e-01
1 rs13475700 4098402 0 A C 0.127 -5.659089e-02 7.175374e-02 4.340145e+00 4.304306e-01
\end{verbatim}
%
The 11 columns are: chromosome numbers, snp ids, base pair positions
on the chromosome, number of missing values for a given snp, minor
allele, major allele, allele frequency, beta estimates, standard
errors for beta, remle estimates for lambda, and $p$ values from Wald
test.
\subsection{Association Tests with Multivariate Linear Mixed Models}
\subsubsection{Basic Usage}
The basic usages for association analysis with either the PLINK binary
ped format or the BIMBAM format are:
\begin{verbatim}
./gemma -bfile [prefix] -k [filename] -lmm [num] -n [num1] [num2] [num3] -o [prefix]
./gemma -g [filename] -p [filename] -a [filename] -k [filename] -lmm [num]
-n [num1] [num2] [num3] -o [prefix]
\end{verbatim}
This is identical to the above univariate linear mixed model
association test, except that an "-n " option is employed to specify
which phenotypes in the phenotype file are used for association tests.
(The values after the ``-n " option should be separated by a space.)
To detect gene environmental interactions, you can add "-gxe
[filename]". This gxe file contains a column of environmental
variables. In this case, for each SNP in turn, GEMMA will fit a linear
mixed model that controls both the SNP main effect and environmental
main effect, while testing for the interaction effect.
Notice that ``-k [filename]" could be replaced by ``-d [filename]" and
``-u [filename]", where ``-d [filename]" specifies the eigen value
file and ``-u [filename]" specifies the eigen vector file. The BIMBAM
mean genotype file and/or the relatedness matrix file (or the eigen
vector file) can be provided in a gzip compressed format.
\subsubsection{Detailed Information}
Although the number of phenotypes used for analysis can be arbitrary,
it is highly recommended to restrict the number of phenotypes to be
small, say, less than ten.
In addition, when a small proportion of phenotypes are partially
missing, one can impute these missing values before association tests:
\begin{verbatim}
./gemma -bfile [prefix] -k [filename] -predict -n [num1] [num2] [num3] -o [prefix]
./gemma -g [filename] -p [filename] -a [filename] -k [filename] -predict
-n [num1] [num2] [num3] -o [prefix]
\end{verbatim}
\subsubsection{Output Files}
There will be two output files, both inside an output folder in the
current directory. The prefix.log.txt file contains some detailed
information about the running parameters and computation time. In
addition, prefix.log.txt contains genetic correlations estimates and
their standard errors in the null multivariate linear mixed model.
The prefix.assoc.txt contains the results, and is in a very similar
format as the result file from the univariate association tests. The
number of columns will depend on the number of phenotypes used for
analysis. The first few columns are: chromosome numbers, snp ids, base
pair positions on the chromosome, number of missing values for a given
snp, minor allele, major allele and allele frequency. The last column
contains $p$ values from the association tests. The middle columns
contain beta estimates and the variance matrix for these estimates.
\subsection{Fit a Bayesian Sparse Linear Mixed Model}
\subsubsection{Basic Usage}
The basic usages for fitting a BSLMM with either the PLINK binary ped
format or the BIMBAM format are:
\begin{verbatim}
./gemma -bfile [prefix] -bslmm [num] -o [prefix]
./gemma -g [filename] -p [filename] -a [filename] -bslmm [num] -o [prefix]
\end{verbatim}
where the ``-bslmm [num]" option specifies which model to fit,
i.e. ``-bslmm 1" fits a standard linear BSLMM, ``-bslmm 2" fits a
ridge regression/GBLUP, and ``-bslmm 3" fits a probit BSLMM; ``-bfile
[prefix]" specifies PLINK binary ped file prefix; ``-g [filename]"
specifies BIMBAM mean genotype file name; ``-p [filename]" specifies
BIMBAM phenotype file name; ``-a [filename]" (optional) specifies
BIMBAM SNP annotation file name; ``-o [prefix]" specifies output file
prefix.
Notice that the BIMBAM mean genotype file can be provided in a gzip
compressed format.
\subsubsection{Detailed Information}
Notice that a large memory is needed to fit BSLMM (e.g. may need 20 GB
for a data set with 4000 individuals and 400,000 SNPs), because the
software has to store the whole genotype matrix in the physical
memory.
In default, GEMMA does not require the user to provide a relatedness
matrix explicitly. It internally calculates and uses the centered
relatedness matrix, which has the nice interpretation that each effect
size $\beta_i$ follows a mixture of two normal distributions {\it a
priori}. Of course, one can choose to supply a relatedness matrix by
using the ``-k [filename]" option. In addition, GEMMA does not take
covariates file when fitting BSLMM. However, one can use the BIMBAM
mean genotype file to store these covariates and use ``-notsnp" option
to use them.
The option ``-bslmm 1" fits a linear BSLMM using MCMC, ``-bslmm 2"
fits a ridge regression/GBLUP with standard non-MCMC method, and
``-bslmm 3" fits a probit BSLMM using MCMC. Therefore, option ``-bslmm
2" is much faster than the other two options, and option ``-bslmm 1"
is faster than ``-bslmm 3". For MCMC based methods, one can use ``-w
[num]" to choose the number of burn-in iterations that will be
discarded, and ``-s [num]" to choose the number of sampling iterations
that will be saved. In addition, one can use ``-smax [num]" to choose
the number of maximum SNPs to include in the model (i.e. SNPs that
have addition effects), which may also be needed for the probit BSLMM
because of its heavier computationAL burden. It is up to the users to
decide these values for their own data sets, in order to balance
computation time and computation accuracy.
For binary traits, one can label controls as 0 and cases as 1, and
follow our previous approach to fit the data with a linear BSLMM by
treating the binary case control labels as quantitative traits
\cite{Zhou:2013}. This approach can be justified by recognizing the
linear model as a first order Taylor approximation to a generalized
linear model. One can of course choose to fit a probit BSLMM, but in
our limited experience, we do not find appreciable prediction accuracy
gains in using the probit BSLMM over the linear BSLMM for binary
traits (see a briefly discussion in \cite{Zhou:2013}). This of course
could be different for a different data set.
The genotypes, phenotypes (except for the probit BSLMM), as well as
the relatedness matrix will be centered when fitting the models. The
estimated values in the output files are thus for these centered
values. Therefore, proper prediction will require genotype means and
phenotype means from the individuals in the training set, and one
should always use the same phenotype file (and the same phenotype
column) and the same genotype file, with individuals in the test set
labeled as missing, to fit the BSLMM and to obtain predicted values
described in the next section.
\subsubsection{Output Files}
There will be five output files, all inside an output folder in the
current directory. The prefix.log.txt file contains some detailed
information about the running parameters and computation time. In
addition, prefix.log.txt contains PVE estimate and its standard error
in the null linear mixed model (not the BSLMM).
The prefix.hyp.txt contains the posterior samples for the
hyper-parameters ($h$, PVE, $\rho$, PGE, $\pi$ and $|\gamma|$), for
every $10$th iteration. An example file with a few SNPs is shown
below:
%
\begin{verbatim}
h pve rho pge pi n_gamma
4.777635e-01 5.829042e-01 4.181280e-01 4.327976e-01 2.106763e-03 25
5.278073e-01 5.667885e-01 3.339020e-01 4.411859e-01 2.084355e-03 26
5.278073e-01 5.667885e-01 3.339020e-01 4.411859e-01 2.084355e-03 26
6.361674e-01 6.461678e-01 3.130355e-01 3.659850e-01 2.188401e-03 25
5.479237e-01 6.228036e-01 3.231856e-01 4.326231e-01 2.164183e-03 27
\end{verbatim}
%
The prefix.param.txt contains the posterior mean estimates for the
effect size parameters ($\alpha$, $\beta | \gamma==1$ and
$\gamma$). An example file with a few SNPs is shown below:
%
\begin{verbatim}
chr rs ps n_miss alpha beta gamma
1 rs3683945 3197400 0 -7.314495e-05 0.000000e+00 0.000000e+00
1 rs3707673 3407393 0 -7.314495e-05 0.000000e+00 0.000000e+00
1 rs6269442 3492195 0 -3.412974e-04 0.000000e+00 0.000000e+00
1 rs6336442 3580634 0 -8.051198e-05 0.000000e+00 0.000000e+00
1 rs13475700 4098402 0 -1.200246e-03 0.000000e+00 0.000000e+00
\end{verbatim}
%
Notice that the beta column contains the posterior mean estimate for
$\beta_i | \gamma_i==1$ rather than $\beta_i$. Therefore, the effect
size estimate for the additional effect is $\beta_i\gamma_i$, and in
the special case $\bK=\bX\bX^T/p$, the total effect size estimate is
$\alpha_i+\beta_i\gamma_i$.
The prefix.bv.txt contains a column of breeding value estimates $\hat
\bu$. Individuals with missing phenotypes will have values of ``NA".
The prefix.gamma.txt contains the posterior samples for the gamma, for
every $10$th iteration. Each row lists the SNPs included in the model
in that iteration, and those SNPs are represented by their row numbers
(+1) in the prefix.param.txt file.
\subsection{Predict Phenotypes Using Output from BSLMM}
\subsubsection{Basic Usage}
The basic usages for association analysis with either the PLINK binary
ped format or the BIMBAM format are:
\begin{verbatim}
./gemma -bfile [prefix] -epm [filename] -emu [filename] -ebv [filename] -k [filename]
-predict [num] -o [prefix]
./gemma -g [filename] -p [filename] -epm [filename] -emu [filename] -ebv [filename]
-k [filename] -predict [num] -o [prefix]
\end{verbatim}
where the ``-predict [num]" option specifies where the predicted
values need additional transformation with the normal cumulative
distribution function (CDF), i.e. ``-predict 1" obtains predicted
values, ``-predict 2" obtains predicted values and then transform them
using the normal CDF to probability scale; ``-bfile [prefix]"
specifies PLINK binary ped file prefix; ``-g [filename]" specifies
BIMBAM mean genotype file name; ``-p [filename]" specifies BIMBAM
phenotype file name; ``-epm [filename]" specifies the output estimated
parameter file (i.e. prefix.param.txt file from BSLMM); ``-emu
[filename]" specifies the output log file which contains the estimated
mean (i.e. prefix.log.txt file from BSLMM); ``-ebv [filename]"
specifies the output estimated breeding value file (i.e. prefix.bv.txt
file from BSLMM); ``-k [filename]" specifies relatedness matrix file
name; ``-o [prefix]" specifies output file prefix.
\subsubsection{Detailed Information}
GEMMA will obtain predicted values for individuals with missing
phenotype, and this process will require genotype means and phenotype
means from the individuals in the training set. Therefore, use the
same phenotype file (and the same phenotype column) and the same
genotype file, as used in fitting BSLMM.
There are two ways to obtain predicted values: use $\hat \bu$ and
$\hat\bbeta$, or use $\hat\balpha$ and $\hat\bbeta$. We note that
$\hat\bu$ and $\hat\balpha$ are estimated in slightly different ways,
and so even in the special case $\bK=\bX\bX^T/p$, $\hat\bu$ may not
equal to $\bX\hat\balpha$. However, in this special case, these two
approaches typically give similar results based on our previous
experience. Therefore, if one used the default matrix in fitting the
BSLMM, then it may not be necessary to supply ``-ebv [filename]" and
``-k [filename]" options, and GEMMA can use only the estimated
parameter file and log file to obtain predicted values by the second
approach. But of course, one can choose to use the first approach
which is more formal, and when do so, one needs to calculate the
centered matrix based on the same phenotype column used in BSLMM
(i.e. to use only SNPs that were used in the fitting). On the other
hand, if one did not use the default matrix in fitting the BSLMM, then
one needs to supply the same relatedness matrix here again.
The option ``-predict 2" should only be used when a probit BSLMM was
used to fit the data. In particular, for binary phenotypes, if one
fitted the linear BSLMM then one should use the option``-predict 1",
and use option ``-predict 2" only if one fitted the data with the
probit BSLMM.
Here, unlike in previous sections, all SNPs that have estimated effect
sizes will be used to obtain predicted values, regardless of their
minor allele frequency and missingness. SNPs with missing values will
be imputed by the mean genotype of that SNP in the training data set.
\subsubsection{Output Files}
There will be two output files, both inside an output folder in the
current directory. The prefix.log.txt file contains some detailed
information about the running parameters and computation time, while
the prefix.prdt.txt contains a column of predicted values for all
individuals. In particular, individuals with missing phenotypes will
have predicted values, while individuals with non-missing phenotypes
will have ``NA"s.
\subsection{Variance Component Estimation with Relatedness Matrices}
\subsubsection{Basic Usage}
The basic usages for variance component estimation with relatedness
matrices are:
\begin{verbatim}
./gemma -p [filename] -k [filename] -n [num] -vc [num] -o [prefix]
./gemma -p [filename] -mk [filename] -n [num] -vc [num] -o [prefix]
\end{verbatim}
where the ``-vc [num]" option specifies which estimation to use, in
particular, "-vc 1" (default) uses HE regression and "-vc 2" uses REML
AI algorithm; ``-p [filename]" specifies phenotype file name; ``-n
[num]" (default 1) specifies which column of phenotype to use
(e.g. one can use "-n 6" for a fam file); ``-k [filename]" specifies
relatedness matrix file name; ``-mk [filename]" specifies the multiple
relatedness matrix file name; the multiple relatedness matrix file is
a text file where each row contains the full path to the relatedness
matrices; ``-o [prefix]" specifies output file prefix.
The relatedness matrix file can be provided in a gzip compressed format.
\subsubsection{Detailed Information}
By default, the variance component estimates from the REML AI
algorithm are constrained to be positive. To allow for unbiased
estimates, one can use "-noconstrain" to pair with "-vc 2". The
estimates from the HE regression are not constrained.
For binary traits, one can label controls as 0 and cases as 1, and
follow our previous approaches to fit the data with a linear mixed
model by treating the binary case control labels as quantitative
traits \cite{Zhou:2013}. A scaling factor can be used to transform
variance component estimates from the observed scale back to liability
scale \cite{Zhou:2013}.
\subsubsection{Output Files}
One output file will be generated inside an output folder in the
current directory. This prefix.log.txt file contains detailed
information about the running parameters, computation time, as well as
the variance component/PVE estimates and their standard errors.
\subsection{Variance Component Estimation with Summary Statistics}
\subsubsection{Basic Usage}
This analysis option requires marginal z-scores from the study and
individual-level genotypes from a random subset of the study (or a
separate reference panel). The marginal z-scores are provided in a
beta file while the genotypes can be provided either in the PLINK
binary ped format or the BIMBAM format. The basic usages for variance
component estimation with summary statistics are:
\begin{verbatim}
./gemma -beta [filename] -bfile [prefix] -vc 1 -o [prefix]
./gemma -beta [filename] -g [filename] -p [filename] -a [filename] -vc 1 -o [prefix]
\end{verbatim}
where the ``-vc 1" option specifies to use MQS-HEW; ``-beta
[filename]" specifies beta file name; ``-bfile [prefix]" specifies
PLINK binary ped file prefix; ``-g [filename]" specifies BIMBAM mean
genotype file name; ``-p [filename]" specifies BIMBAM phenotype file
name; ``-a [filename]" (optional) specifies BIMBAM SNP annotation file
name; ``-o [prefix]" specifies output file prefix. Note that the
phenotypes in the phenotype file are not used in this analysis and are
only for selecting individuals. The use of phenotypes is different
from the CI1 method detailed in the next section.
To fit a multiple variance component model, you will need to add "-cat
[filename]" to provide the SNP category file that classifies SNPs into
different non-overlapping categories.
The beta file and genotype file can be provided in a gzip compressed
format. In addition, to fit MQS-LDW, you will need to add "-wcat
[filename]" together with "-vc 2". The "-wcat [filename]" option
specifies the LD score file, which can be provided in a gzip
compressed format.
A feature of MQS based variance component estimation is that one only need to use a subset of samples to estimate certain quantities. Using a subset of samples dramatically improves computation speed while maintaining variance component estimation accuracy. To take this strategy, one can use ``-sample [num]" to use a fixed number of random samples to perform estimation.
Instead of using the genotype data from the study, one can also use genotype data from a reference panel. For example, one can use the genotype data from the 1000 genomes project as the reference. However, any population stratification in the reference panel should be dealt with first. For example, the individuals with European ancestry in the 1000 genomes project come from five subpopulations: CEU, FIN, GBR, IBS, and TSI. MQS computes SNP correlations across all SNP pairs as it should be under the LMM assumption. Therefore, any population stratification in the reference panel would increase the overall SNP correlation estimate, leading to down-ward bias in the final heritability estimate. To address the population stratification in the reference panel, one can include a few dummy variables in the model fitting step as covariates. These covariates represent, for example, the five subpopulations, and are used to effectively center the genotype mean in each subpopulation separately. To do this, one can create a covariate file containing five columns (no header): the first column is all 1 representing the intercept; the second column is 1 for CEU and 0 for others; the third column is 1 for FIN and 0 for others; ...; while the fifth column is 1 for IBS and 0 for others. Afterwards, one can add "-c [filename]" to include this covariate file in the command line.
\subsubsection{Detailed Information}
MQS-LDW uses an iterative procedure to update the variance
components. It will first compute the MQS-HEW estimates and then use
these estimates to update and obtain the MQS-LDW estimates. Therefore,
there will be two outputs in the terminal, but only the final results
are saved in the output file.
By default, the standard errors for the variance component estimates
are computed with the approximate block-wise jackknife method. The
jackknife method works well for unrelated individuals and quantitative
traits. If you are interested in using the aymptotic method that is
validated in all scenarios, you need to provide genotypes and
phenotypes from the study, as well as the output files from the
previous MQS run. The basic usages for using the asymptotic form to
compute the confidence intervals are
\begin{verbatim}
./gemma -beta [filename] -bfile [prefix] -ref [prefix] -pve [num] -ci 1 -o [prefix]
./gemma -beta [filename] -g [filename] -p [filename] -ref [prefix] -pve [num] -ci 1 -o [prefix]
\end{verbatim}
In the above usages, ``-ref [prefix]" specifies the prefix of the
output file (including full path) from the previous MQS fit (e.g. q
and S estimates from the reference genotype files); and ``-pve [num]"
specifies the pve estimates from the previous MQS fit. PLINK format
files can be replaced with BIMBAM mean genotype files. In addition, to
fit MQS-LDW, one can add "-wcat [filename]" together with "-ci 2". The
"-wcat [filename]" option specifies the LD score file, which can be
provided in a gzip compressed format.
The asympototic method requires additional summary statistics besides
marginal z-scores \cite{Zhou:2016}. In the current implementation of
the asymptotic form, we use individual level data from the study to
compute these extra summary statistics internally. Thus, at this
stage, the asymptotic form requires the genotype and phenotype files
for all individuals from the study. In the near future, we will output
these extra summary statistics to facilitate consortium studies and
meta-analysis. We are currently working with some consortium studies
to figure out the best way to output these values and to make the
asymptotic method easier to use.
For binary traits, one can label controls as 0 and cases as 1, and
follow our previous approaches to fit the data with a linear mixed
model by treating the binary case control labels as quantitative
traits \cite{Zhou:2013}. A scaling factor can be used to transform
variance component estimates from the observed scale back to liability
scale \cite{Zhou:2013}.
\subsubsection{Output Files}
Five output files will be generated inside an output folder in the
current directory. The prefix.log.txt file contains detailed
information about the running parameters, computation time, as well as
the variance component/PVE estimates and their standard errors. This
is the main file of interest. The prefix.S.txt file contains S
estimates and their estimated errors. The prefix.q.txt file contains q
estimates. The prefix.Vq.txt file contains the standard errors for
q. The prefix.size.txt file contains the number of SNPs in each
category and the number of individuals in the study.
\clearpage
\newpage
\section{Questions and Answers}
\subsection{How do I use a unique output directory?}
You can use -outdir with gemma as a bash script
\begin{verbatim}
-outdir $(mktemp -d -p $HOME)
\end{verbatim}
makes a unique temp directory where the output is stored, here
relative to \$HOME, but you can take any path.
\subsection{How do I prepare the phenotype file for BSLMM?}
Q: I want to perform a cross validation with my data, and want to fit
BSLMM in the training data set and obtain predicted values for
individuals in the test data set. How should I prepare the phenotype
file?
A: One should always use the same phenotype and genotype files for
both fitting BSLMM and obtaining predicted values. Therefore, one
should combine individuals in the training set and test set into a
single phenotype and genotype file before running GEMMA. Specifically,
in the phenotype file, one should label individuals in the training
set with the true phenotype values, and label individuals in the test
set as missing (e.g. ``NA"). Then, one can fit BSLMM with the files as
BSLMM only uses individuals with non-missing phenotypes
(i.e. individuals in the training set). Afterwards, one can obtain
predicted values using the ``-predict" option on the same files, and
the predicted values will be obtained only for individuals with
missing phenotypes (i.e. individuals in the test set). Notice that the
software will still output ``NA" for individuals with non-missing
phenotypes so that the number of individuals in the output
prefix.prdt.txt file will match the total sample size. Please refer to
the GWAS sample data set and some demo scripts included with the GEMMA
source code for detailed examples.
\clearpage
\newpage
\section{Options}
\textcolor{blue}{File I/O Related Options}
%
\begin{itemize}
\item \textcolor{red}{-bfile [prefix]} \quad specify input plink binary file prefix; require .fam, .bim and .bed files
\item \textcolor{red}{-g [filename]} \quad specify input bimbam mean genotype file name
\item \textcolor{red}{-p [filename]} \quad specify input bimbam phenotype file name
\item \textcolor{red}{-n [num]} \quad specify phenotype column in the phenotype file (default 1); or to specify which phenotypes are used in the mvLMM analysis
\item \textcolor{red}{-a [filename]} \quad specify input bimbam SNPs annotation file name (optional)
\item \textcolor{red}{ -k [filename]} \quad specify input kinship/relatedness matrix file name
\item \textcolor{red}{ -km [num]} \quad specify input kinship/relatedness file type (default 1; valid value 1 or 2).
\item \textcolor{red}{ -d [filename]} \quad specify input eigen value file name
\item \textcolor{red}{ -u [filename]} \quad specify input eigen vector file name
\item \textcolor{red}{ -c [filename] } \quad specify input covariates file name (optional); an intercept term is needed in the covariates file
\item \textcolor{red}{ -widv [filename] } \quad weight file contains a column of positive values to be used as weights for residuals---each weight corresponds to an individual, in which a high weight corresponds to high residual error variance for this individual (similar in format to phenotype file)
\item \textcolor{red}{ -gxe [filename] } \quad specify input environmental covariate file name; this is only used for detecting gene x environmental interactions
\item \textcolor{red}{ -cat [filename] } \quad specify input SNP category file name; this is only used for variance component estimation using summary statistics
\item \textcolor{red}{ -beta [filename] } \quad specify input beta/z file name; this is only used for variance component estimation using summary statistics
\item \textcolor{red}{ -epm [filename] } \quad specify input estimated parameter file name
\item \textcolor{red}{ -en [n1] [n2] [n3] [n4]} \quad specify values for the input estimated parameter file (with a header) (default 2 5 6 7 when no -ebv -k files, and 2 0 6 7 when -ebv and -k files are supplied; n1: rs column number; n2: estimated alpha column number (0 to ignore); n3: estimated beta column number (0 to ignore); n4: estimated gamma column number (0 to ignore).)
\item \textcolor{red}{ -ebv [filename] } \quad specify input estimated random effect (breeding value) file name
\item \textcolor{red}{ -emu [filename] } \quad specify input log file name containing estimated mean
\item \textcolor{red}{ -mu [filename] } \quad specify estimated mean value directly, instead of using -emu file
\item \textcolor{red}{ -snps [filename] } \quad specify input snps file name to only analyze a certain set of snps; contains a column of snp ids
\item \textcolor{red}{ -pace [num]} \quad specify terminal display update pace (default 100000).
\item \textcolor{red}{ -outdir [prefix]} \quad specify output directory path (default ``./output/")
\item \textcolor{red}{ -o [prefix]} \quad specify output file prefix (default ``result")
\item \textcolor{red}{ -outdir [path]} \quad specify output directory path (default ``./output/")
\end{itemize}
%
\textcolor{blue}{SNP Quality Control Options}
%
\begin{itemize}
\item \textcolor{red}{-miss [num] } \quad specify missingness threshold (default 0.05)
\item \textcolor{red}{-maf [num] } \quad specify minor allele frequency threshold (default 0.01)
\item \textcolor{red}{-r2 [num] } \quad specify r-squared threshold (default 0.9999)
\item \textcolor{red}{-hwe [num] } \quad specify HWE test p value threshold (default 0; no test)
\item \textcolor{red}{-notsnp } \quad minor allele frequency cutoff is not used and so all real values can be used as covariates
\end{itemize}
%
\textcolor{blue}{Linear Model Options}
%
\begin{itemize}
\item \textcolor{red}{-lm [num]} \quad specify frequentist analysis choice (default 1; valid value 1-4; 1: Wald test; 2: likelihood ratio test; 3: score test; 4: all 1-3.)
\end{itemize}
%
\textcolor{blue}{Relatedness Matrix Calculation Options}
%
\begin{itemize}
\item \textcolor{red}{-gk [num]} \quad specify which type of kinship/relatedness matrix to generate (default 1; valid value 1-2; 1: centered matrix; 2: standardized matrix.)
\end{itemize}
%
\textcolor{blue}{Eigen Decomposition Options}
%
\begin{itemize}
\item \textcolor{red}{-eigen} \quad specify to perform eigen decomposition of the relatedness matrix
\end{itemize}
%
\textcolor{blue}{Linear Mixed Model Options}
%
\begin{itemize}
\item \textcolor{red}{-lmm [num]} \quad specify frequentist analysis choice (default 1; valid value 1-4; 1: Wald test; 2: likelihood ratio test; 3: score test; 4: all 1-3.)
\item \textcolor{red}{-lmin [num] } \quad specify minimal value for lambda (default 1e-5)
\item \textcolor{red}{-lmax [num] } \quad specify maximum value for lambda (default 1e+5)
\item \textcolor{red}{-region [num]} \quad specify the number of regions used to evaluate lambda (default 10)
\end{itemize}
%
\textcolor{blue}{Bayesian Sparse Linear Mixed Model Options}
%
\begin{itemize}
\item \textcolor{red}{-bslmm [num]} \quad specify analysis choice (default 1; valid value 1-3; 1: linear BSLMM; 2: ridge regression/GBLUP; 3: probit BSLMM.)
\item \textcolor{red}{-hmin [num] } \quad specify minimum value for h (default 0)
\item \textcolor{red}{-hmax [num]} \quad specify maximum value for h (default 1)
\item \textcolor{red}{-rmin [num]} \quad specify minimum value for rho (default 0)
\item \textcolor{red}{-rmax [num]} \quad specify maximum value for rho (default 1)
\item \textcolor{red}{-pmin [num]} \quad specify minimum value for log10(pi) (default log10(1/p), where p is the number of analyzed SNPs )
\item \textcolor{red}{-pmax [num]} \quad specify maximum value for log10(pi) (default log10(1) )
\item \textcolor{red}{-smin [num]} \quad specify minimum value for |gamma| (default 0)
\item \textcolor{red}{-smax [num]} \quad specify maximum value for |gamma| (default 300)
\item \textcolor{red}{-gmean [num]} \quad specify the mean for the geometric distribution (default: 2000)
\item \textcolor{red}{ -hscale [num]} \quad specify the step size scale for the proposal distribution of h (value between 0 and 1, default min(10/sqrt(n),1) )
\item \textcolor{red}{-rscale [num]} \quad specify the step size scale for the proposal distribution of rho (value between 0 and 1, default min(10/sqrt(n),1) )
\item \textcolor{red}{-pscale [num] } \quad specify the step size scale for the proposal distribution of log10(pi) (value between 0 and 1, default min(5/sqrt(n),1) )
\item \textcolor{red}{-w [num]} \quad specify burn-in steps (default 100,000)
\item \textcolor{red}{-s [num] } \quad specify sampling steps (default 1,000,000)
\item \textcolor{red}{-rpace [num]} \quad specify recording pace, record one state in every [num] steps (default 10)
\item \textcolor{red}{-wpace [num]} \quad specify writing pace, write values down in every [num] recorded steps (default 1000)
\item \textcolor{red}{-seed [num] } \quad specify random seed (a random seed is generated by default)
\item \textcolor{red}{-mh [num]} \quad specify number of MH steps in each iteration (default 10; requires 0/1 phenotypes and -bslmm 3 option)
\end{itemize}
%
\textcolor{blue}{Prediction Options}
%
\begin{itemize}
\item \textcolor{red}{-predict [num]} \quad specify prediction options (default 1; valid value 1-2; 1: predict for individuals with missing phenotypes; 2: predict for individuals with missing phenotypes, and convert the predicted values using normal CDF.)
\end{itemize}
%
\textcolor{blue}{Variance Component Estimation Options}
%
\begin{itemize}
\item \textcolor{red}{-vc [num]} \quad specify fitting algorithm. For individual level data (default 1; valid value 1-2; 1: HE regression; 2: REML AI algorithm.). For summary statistics (default 1; valid value 1-2; 1: MQS-HEW; 2: MQS-LDW.)
\item \textcolor{red}{-ci [num]} \quad specify fitting algorithm to compute the standard errors. (default 1; valid value 1-2; 1: MQS-HEW; 2: MQS-LDW.)
\end{itemize}
%
\clearpage
\bibliographystyle{plain}
\bibliography{manual}
\end{document}
|