summary refs log tree commit diff
diff options
context:
space:
mode:
author Pjotr Prins 2025-12-17 13:28:26 +0100
committer Pjotr Prins 2026-01-05 11:12:11 +0100
commit 28ea75d9f17c38bc769281ecc6f11ab6bd7e4fd2 (patch)
tree 8b0478f46bb6dd301e4fc05447c5ee506fbe38d8
parent 384023cd12788b4d1aa723ea01f1ab514515decd (diff)
download gn-gemtext-28ea75d9f17c38bc769281ecc6f11ab6bd7e4fd2.tar.gz
Minor edits
-rw-r--r-- issues/systems/apps.gmi 9
-rw-r--r-- topics/genetics/test-pangenome-derived-genotypes.gmi 66
2 files changed, 66 insertions, 9 deletions
diff --git a/issues/systems/apps.gmi b/issues/systems/apps.gmi
index 12065a5..e374250 100644
--- a/issues/systems/apps.gmi
+++ b/issues/systems/apps.gmi
@@ -196,7 +196,7 @@ Container is at
 
 => https://git.genenetwork.org/gn-machines/tree/gn/services/mouse-longevity.scm
 
-gaeta:~/iwrk/deploy/gn-machines$ guix system container -L . -L ~/guix-bioinformatics --verbosity=3 test-r-container.scm -L ~/iwrk/deploy/guix-forge/guix 
+gaeta:~/iwrk/deploy/gn-machines$ guix system container -L . -L ~/guix-bioinformatics --verbosity=3 test-r-container.scm -L ~/iwrk/deploy/guix-forge/guix
 forge/nginx.scm:145:40: error: acme-service-type: unbound variable
 hint: Did you forget `(use-modules (forge acme))'?
 
@@ -205,6 +205,8 @@ hint: Did you forget `(use-modules (forge acme))'?
 
 Jumpshiny is hosted on balg01. Scripts are in tux02 git.
 
+=> git.genenetwork.org:/home/git/shared/source/jumpshiny
+
 ```
 root@balg01:/home/j*/gn-machines# . /usr/local/guix-profiles/guix-pull/etc/profile
 guix system container --network -L . -L ../guix-forge/guix/ -L ../guix-bioinformatics/ -L ../guix-past/modules/ --substitute-urls='https://ci.guix.gnu.org https://bordeaux.guix.gnu.org https://cuirass.genenetwork.org' test-r-container.scm -L ../guix-forge/guix/gnu/store/xyks73sf6pk78rvrwf45ik181v0zw8rx-run-container
@@ -216,3 +218,8 @@ Currently:
 Jumpshiny: as aijun, cd services/jumpshiny and ./.guix-run
 
 
+## JUMPsem_web
+
+Another shiny app to run on balg01.
+
+JUMPsem_web: as aijun, cd services/jumpsem and ./.guix-run
diff --git a/topics/genetics/test-pangenome-derived-genotypes.gmi b/topics/genetics/test-pangenome-derived-genotypes.gmi
index 4f806ee..3a8473a 100644
--- a/topics/genetics/test-pangenome-derived-genotypes.gmi
+++ b/topics/genetics/test-pangenome-derived-genotypes.gmi
@@ -9,6 +9,9 @@ For the BXD we have 23M markers(!) whereof 8M *not* on the reference genome.
 
 # Tasks
 
+* [ ] Document lmdb geno and marker information
+* [ ] Extract epoch information
+* [ ] Add BED file and link SNPs
 * [ ] Check MAF filter - it may be too stringent
 * [ ] Use ravanan/CWL to push to Octopus
 * [ ] Reintroduce nodes that were not annotated for position (Flavia)
@@ -20,7 +23,24 @@ For the BXD we have 23M markers(!) whereof 8M *not* on the reference genome.
 
 # Summary
 
-To get the mapping and generate the assoc output in mdb format we run a variant of gemma-wrapper
+To get the mapping and generate the assoc output in mdb format we run a variant of gemma-wrapper.
+
+The workflow is essentially:
+
+* Capture the significant markers from GEMMA's mdb output (as created by gemma-wrapper)
+* Transform these into RDF using the 'gemma-mdb-to-rdf.rb' script
+* Upload that RDF into virtuoso
+* Download a table of start-stop data from virtuoso using SPARQL
+* Compute QTL locations using 'sparql-qtl-detect.rb'
+* Upload that RDF into virtuoso as well
+
+For mapping, virtuoso contains four important ttl files:
+
+* marker positions in pangenome-marker graph
+* mapped markers in pangenome-mapped graph
+* computed QTL positions in pangenome-qtl graph
+* trait values in traits graph (not yet implemented)
+
 
 ```
 gemma-batch-run.sh
@@ -30,12 +50,11 @@ Next we convert that output to RDF with
 
 ```
 ../bin/gemma-mdb-to-rdf.rb --header > output.ttl
-../bin/gemma-mdb-to-rdf.rb --anno snps-matched.txt.mdb tmp/panlmm/*-gemma-GWA.tar.xz >> test-run-3000.ttl
-serdi -i turtle -o ntriples test-run-3000.ttl > test-run-3000.n3
+time ../bin/gemma-mdb-to-rdf.rb --anno snps-matched.txt.mdb tmp/panlmm/*-gemma-GWA.tar.xz >> output.ttl # two hours for 7000 traits
+time serdi -i turtle -o ntriples output.ttl > output.n3
 ```
 
-(serdi does better than rapper with huge files) and
-copy the file to the virtuoso instance and load it with isql:
+Note that n3 files are less error-prone and that serdi does better than rapper with huge files. Copy the file to the virtuoso instance and load it with isql (it may be worth search-replacing the gnt:run tag with something descriptive).
 
 ```
 cd /export/guix-containers/virtuoso/data/virtuoso/ttl/
@@ -47,13 +66,26 @@ SQL> SELECT * FROM DB.DBA.load_list;
 SQL> DELETE from DB.DBA.LOAD_LIST where ll_error IS NOT NULL ;
 SQL> DELETE from DB.DBA.LOAD_LIST where LL_STATE = 1;
 # commit changes
-SQL> rdf_loader_run ();
+SQL> rdf_loader_run (); // about 1 min per GB n3
 SQL> checkpoint;
 Done. -- 16 msec.
 SQL> SPARQL SELECT count(*) FROM <http://pan-test.genenetwork.org> WHERE { ?s ?p ?o } LIMIT 10;
 34200686
 ```
 
+Note it may be a good idea to drop graphs first. That is why we have separate subgraph spaces for every large TTL file:
+
+```
+log_enable(3,1);
+SQL> SPARQL CLEAR GRAPH  <http://pan-test.genenetwork.org>;
+SQL> SPARQL CLEAR GRAPH  <http://pan-mapped.genenetwork.org>; // 10 min
+SQL> SPARQL CLEAR GRAPH  <http://pangenome-marker.genenetwork.org>;
+SQL> ld_dir('/export/data/virtuoso/ttl','pangenome-markers.n3','http://pangenome-marker.genenetwork.org');
+SQL> SPARQL SELECT count(*) FROM <http://pan-test.genenetwork.org> WHERE { ?s ?p ?o } LIMIT 10;
+```
+
+For pangenomes we have a marker file and a QTL file.
+
 As a test, fetch a table of the traits with their SNPs
 
 ```
@@ -71,7 +103,7 @@ PREFIX qb: <http://purl.org/linked-data/cube#>
 PREFIX xkos: <http://rdf-vocabulary.ddialliance.org/xkos#>
 PREFIX pubmed: <http://rdf.ncbi.nlm.nih.gov/pubmed/>
 
-SELECT * FROM <http://pan-test.genenetwork.org> WHERE {
+SELECT * FROM <http://pangenome-mapped.genenetwork.org> WHERE {
 ?traitid a gnt:mappedTrait;
          gnt:run gn:test .
 ?snp gnt:mappedSnp ?traitid ;
@@ -88,13 +120,31 @@ FILTER (contains(?nodeid,"Marker") && ?pos < 1000)
 OK, we are ready to run a little workflow. First create a sorted list of IDs.
 
 ```
-SELECT DISTINCT ?trait FROM <http://pan-test.genenetwork.org> WHERE {
+PREFIX dct: <http://purl.org/dc/terms/>
+PREFIX gn: <http://genenetwork.org/id/>
+PREFIX owl: <http://www.w3.org/2002/07/owl#>
+PREFIX gnc: <http://genenetwork.org/category/>
+PREFIX gnt: <http://genenetwork.org/term/>
+PREFIX sdmx-measure: <http://purl.org/linked-data/sdmx/2009/measure#>
+PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
+PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
+PREFIX qb: <http://purl.org/linked-data/cube#>
+PREFIX xkos: <http://rdf-vocabulary.ddialliance.org/xkos#>
+PREFIX pubmed: <http://rdf.ncbi.nlm.nih.gov/pubmed/>
+
+SELECT DISTINCT ?trait FROM <http://pangenome-mapped.genenetwork.org> WHERE {
 ?traitid a gnt:mappedTrait;
          gnt:run gn:test ;
          gnt:traitId ?trait.
 }
 ```
 
+See also
+
+=> https://github.com/genetics-statistics/gemma-wrapper/blob/master/doc/examples/list-traits.sparql
+
 Sort that list and save as 'pan-ids-sorted.txt'. Next run
 
 ```