diff options
| author | Pjotr Prins | 2025-11-30 10:12:46 +0100 |
|---|---|---|
| committer | Pjotr Prins | 2025-11-30 10:12:46 +0100 |
| commit | ca03f0043f26b6019a613ae5fb0ea2cf45c52108 (patch) | |
| tree | d5738a962efca65c17deacfa3fc665db48554695 /src | |
| parent | ab434bbd38a61941bf32afcf91b7cc397462a53b (diff) | |
| download | pangemma-ca03f0043f26b6019a613ae5fb0ea2cf45c52108.tar.gz | |
Extracting MAF computation
Diffstat (limited to 'src')
| -rw-r--r-- | src/gemma_io.cpp | 64 | ||||
| -rw-r--r-- | src/gemma_io.h | 2 | ||||
| -rw-r--r-- | src/param.cpp | 2 |
3 files changed, 43 insertions, 25 deletions
diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp index baa44b9..f176dac 100644 --- a/src/gemma_io.cpp +++ b/src/gemma_io.cpp @@ -650,6 +650,32 @@ bool ReadFile_fam(const string &file_fam, vector<vector<int>> &indicator_pheno, return true; } +double compute_maf(size_t ni_total, const gsl_vector *gs) { + double maf = 0; + size_t n_0 = 0; + size_t n_1 = 0; + size_t n_2 = 0; + + for (int i = 0; i < ni_total; ++i) { // read genotypes + double geno; + gsl_vector_get(gs, i); + if (geno >= 0 && geno <= 0.5) { + n_0++; + } + if (geno > 0.5 && geno < 1.5) { + n_1++; + } + if (geno >= 1.5 && geno <= 2.0) { + n_2++; + } + maf += geno; + } + auto ni_test = ni_total; + auto n_miss = 0; + maf /= 2.0 * (double)(ni_test - n_miss); + return maf; +} + // Read bimbam mean genotype file, the first time, to obtain #SNPs for // analysis (ns_test) and total #SNP (ns_total). /* @@ -736,9 +762,8 @@ bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps, double cM; size_t file_pos; - double maf, geno, geno_old; + double geno, geno_old; size_t n_miss; - size_t n_0, n_1, n_2; double min_g = std::numeric_limits<float>::max(); double max_g = std::numeric_limits<float>::min(); @@ -749,6 +774,7 @@ bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps, for (int i = 0; i < ni_total; ++i) { ni_test += indicator_idv[i]; } + assert(ni_test == ni_total); // we use indicator_idv? ns_test = 0; file_pos = 0; @@ -795,15 +821,13 @@ bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps, } // Start on a new marker/SNP - maf = 0; n_miss = 0; flag_poly = 0; geno_old = -9; - n_0 = 0; - n_1 = 0; - n_2 = 0; c_idv = 0; gsl_vector_set_zero(genotype_miss); + + auto infilen = file_geno.c_str(); for (int i = 0; i < ni_total; ++i) { // read genotypes ch_ptr = strtok_safe2(NULL, " ,\t",(string("To many individuals/strains/genometypes in pheno file for ")+infilen).c_str()); @@ -820,15 +844,6 @@ bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps, } geno = atof(ch_ptr); - if (geno >= 0 && geno <= 0.5) { - n_0++; - } - if (geno > 0.5 && geno < 1.5) { - n_1++; - } - if (geno >= 1.5 && geno <= 2.0) { - n_2++; - } gsl_vector_set(genotype, c_idv, geno); if (geno < min_g) min_g = geno; @@ -842,12 +857,10 @@ bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps, if (flag_poly == 2 && geno != geno_old) { flag_poly = 1; // genotypes differ } - - maf += geno; - c_idv++; } - maf /= 2.0 * (double)(ni_test - n_miss); + + double maf = compute_maf(ni_total,genotype); SNPINFO sInfo = {chr, rs, cM, b_pos, @@ -877,14 +890,14 @@ bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps, } // -hwe flag + /* FIXME FIXME if (hwe_level != 0 && maf_level != -1) { if (CalcHWE(n_0, n_2, n_1) < hwe_level) { indicator_snp.push_back(0); continue; } } - - + */ // -r2 flag for (size_t i = 0; i < genotype->size; ++i) { if (gsl_vector_get(genotype_miss, i) == 1) { @@ -892,7 +905,6 @@ bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps, gsl_vector_set(genotype, i, geno); } } - gsl_blas_dgemv(CblasTrans, 1.0, W, genotype, 0.0, Wtx); gsl_blas_dgemv(CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); gsl_blas_ddot(genotype, genotype, &v_x); @@ -1477,7 +1489,7 @@ void ReadFile_eigenD(const string &file_kd, bool &error, gsl_vector *eval) { // Read bimbam mean genotype file and calculate kinship matrix. -gsl_matrix *mdb_calc_kin(const string file_geno, bool is_loco, const int k_mode, const vector<int> &indicator_snp) { +gsl_matrix *mdb_calc_kin(const string file_geno, bool is_loco, const int k_mode) { checkpoint("mdb-kin",file_geno); /* Create and open the LMDB environment: */ @@ -1515,12 +1527,14 @@ gsl_matrix *mdb_calc_kin(const string file_geno, bool is_loco, const int k_mode, num_markers = stat.ms_entries; gsl_matrix *matrix_kin = gsl_matrix_safe_alloc(ni_total,ni_total); + gsl_matrix_set_zero(matrix_kin); gsl_vector *geno = gsl_vector_safe_alloc(ni_total); gsl_vector *geno_miss = gsl_vector_safe_alloc(ni_total); // Xlarge contains inds x markers gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_total, K_BATCH_SIZE); enforce_msg(Xlarge, "allocate Xlarge ni_total x K_BATCH_SIZE"); + gsl_matrix_set_zero(Xlarge); // [ ] check XLarge is zeroed properly // [ ] handle missing data @@ -1540,6 +1554,10 @@ gsl_matrix *mdb_calc_kin(const string file_geno, bool is_loco, const int k_mode, string_view key, value; cursor.get(key, value, mdb_fetch); mdb_fetch = MDB_NEXT; + /* FIXME! + if (indicator_snp[t] == 0) + continue; + */ size_t num_floats = value.size() / sizeof(float); assert(num_floats == ni_total); const float* gs = reinterpret_cast<const float*>(value.data()); diff --git a/src/gemma_io.h b/src/gemma_io.h index de74ada..4e9f4c1 100644 --- a/src/gemma_io.h +++ b/src/gemma_io.h @@ -87,7 +87,7 @@ void ReadFile_mk(const string &file_mk, vector<int> &indicator_idv, void ReadFile_eigenU(const string &file_u, bool &error, gsl_matrix *U); void ReadFile_eigenD(const string &file_d, bool &error, gsl_vector *eval); -gsl_matrix *mdb_calc_kin(const string file_geno, const bool is_loco, const int mode, const vector<int> &indicator_snp); +gsl_matrix *mdb_calc_kin(const string file_geno, const bool is_loco, const int mode); bool BimbamKin(const string file_geno, const set<string> ksnps, vector<int> &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin, diff --git a/src/param.cpp b/src/param.cpp index 1adee9e..f799ef7 100644 --- a/src/param.cpp +++ b/src/param.cpp @@ -1294,7 +1294,7 @@ void PARAM::ReadBIMBAMGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_ } gsl_matrix *PARAM::MdbCalcKin() { - return mdb_calc_kin(file_geno, is_loco(), a_mode - 20, indicator_snp); + return mdb_calc_kin(file_geno, is_loco(), a_mode - 20); } void PARAM::CalcKin(gsl_matrix *matrix_kin) { |
