diff options
| -rw-r--r-- | .gitignore | 1 | ||||
| -rw-r--r-- | README.md | 66 | ||||
| -rw-r--r-- | conn.scm | 7 | ||||
| -rw-r--r-- | etc/sample.json | 8 | ||||
| -rwxr-xr-x | examples/classification.scm | 129 | ||||
| -rwxr-xr-x | examples/dataset-metadata.scm | 450 | ||||
| -rwxr-xr-x | examples/genbank.scm | 29 | ||||
| -rwxr-xr-x | examples/genelist.scm | 200 | ||||
| -rwxr-xr-x | examples/generif.scm | 265 | ||||
| -rwxr-xr-x | examples/genotype.scm | 72 | ||||
| -rwxr-xr-x | examples/phenotype.scm | 64 | ||||
| -rwxr-xr-x | examples/probeset.scm | 203 | ||||
| -rwxr-xr-x | examples/publication.scm | 6 | ||||
| -rwxr-xr-x | examples/schema.scm | 70 | ||||
| -rwxr-xr-x | examples/strains.scm | 52 | ||||
| -rwxr-xr-x | examples/tissue.scm | 9 | ||||
| -rwxr-xr-x | generate-ttl-files.scm | 127 | ||||
| -rwxr-xr-x | json-to-ttl.scm (renamed from json-dump.scm) | 2 | ||||
| -rwxr-xr-x | load-rdf.scm | 2 | ||||
| -rw-r--r-- | manifest.scm | 6 | ||||
| -rw-r--r-- | schema/species.ttl | 4 | ||||
| -rw-r--r-- | transform/schema.scm (renamed from transform/schema-dump.scm) | 20 | ||||
| -rw-r--r-- | transform/special-forms.scm | 70 | ||||
| -rw-r--r-- | transform/sql.scm | 19 | ||||
| -rw-r--r-- | transform/strings.scm | 23 | ||||
| -rw-r--r-- | transform/triples.scm | 24 | ||||
| -rw-r--r-- | transform/uuid.scm | 234 | ||||
| -rwxr-xr-x | visualize-schema.scm | 26 |
28 files changed, 1008 insertions, 1180 deletions
diff --git a/.gitignore b/.gitignore index cd9b92c..1219888 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ *.go /**/*~ +/tmp/* diff --git a/README.md b/README.md index 8c94ccd..c8efad2 100644 --- a/README.md +++ b/README.md @@ -28,22 +28,21 @@ $ make or for a container ```shell -mkdir test +mkdir ./tmp guix shell -C --network --share=/run/mysqld/ --manifest=manifest.scm export GUILE_LOAD_PATH=.:$GUILE_LOAD_PATH -guile json-dump.scm conn.scm test/ +guile json-to-ttl.scm etc/sample.json tmp/ ``` +That reads the `etc/sample.json` file included in this repository and converts it to an RDF representation that is stored in a file `./tmp/sampledata.ttl`. + ## Set up connection parameters -Describe the database connection parameters in a file *conn.scm* file as -shown below. Take care to replace the placeholders within angle brackets -with the appropriate values. +Describe the database connection parameters in a file *conn.scm* file as shown below. Take care to replace the placeholders within angle brackets with the appropriate values. ``` scheme -((generif-data-file . "/path/to/generifs_basic.gz") - (sql-username . "<sql-username-here>") +((sql-username . "<sql-username-here>") (sql-password . "<sql-password-here>") (sql-database . "<sql-database-name-here>") (sql-host . "<sql-hostname-here>") @@ -56,14 +55,9 @@ with the appropriate values. (sparql-port . <sparql-endpoint-port-here>)) ``` -Download the GeneRIF data file from -https://ftp.ncbi.nih.gov/gene/GeneRIF/generifs_basic.gz and specify -its path in the `generif-data-file` parameter. - Here's a sample *conn.scm*. ``` scheme -((generif-data-file . "/home/gn/generifs_basic.gz") - (sql-username . "webqtlout") +((sql-username . "webqtlout") (sql-password . "my-secret-password") (sql-database . "db_webqtl") (sql-host . "localhost") @@ -76,30 +70,58 @@ Here's a sample *conn.scm*. (sparql-port . 
9082)) ``` -## Dump the database +## Transform the database -Then, to dump the database to \~/data/dump, run inside shell +Example: Transform the phenotype from SQL to Terse RDF Triple Language (TTL) ```sh -./pre-inst-env ./examples/dump-species-metadata.scm ../conn.scm ~/tmp +guile -s examples/phenotype.scm \ + --settings=conn.scm \ + --output=tmp/phenotype.ttl \ + --documentation=tmp/phenotype.ttl.md ``` -``` shell -$ guix shell -m manifest.scm -- ./pre-inst-env ./examples/dump-dataset-metadata.scm ../conn.scm ~/tmp +The `-s` option to *guile* runs the `examples/phenotype.scm` file as a script. Everything else on the command line is passed on to the script as command-line arguments. + +This should create the files: +- `tmp/phenotype.ttl`: will contain the data in the database in TTL format +- `tmp/phenotype.ttl.md`: will contain short documentation of the data in the file above. + +**Note to Devs**: The current `pre-inst-env` script will not work within containers since it assumes the existence of `/usr/bin/env`. We need to fix that if we intend to keep using it. + + +There is a shorter form of the command above: + +```sh +guile -s examples/phenotype.scm \ + -s conn.scm \ + -o tmp/phenotype.ttl \ + -d tmp/phenotype.ttl.md +``` + +which does the same thing, but has the potential to be confusing due to the two `-s` options: the first `-s` option is to guile while the second is to the script itself. + +There's an extra script that loops through all the Scheme files in `examples/` and runs them. To run it: + +```sh +./generate-ttl-files.scm -s conn.scm -o <ttl-output-directory> -d <docs-output-directory> ``` ## Validate and load dump -Then, validate the dumped RDF using `rapper` and load it into -virtuoso. 
This will load the dumped RDF into the -`http://genenetwork.org` graph, and will delete all pre-existing data -in that graph (FIXME) +Then, validate the dumped RDF using `rapper`: ``` shell $ guix shell -m manifest.scm -- rapper --input turtle --count ~/data/dump/dump.ttl +``` + +If there are no errors, load the relevant RDF files into the `http://genenetwork.org` graph using the `load-rdf.scm` script: + +``` shell $ guix shell -m manifest.scm -- ./pre-inst-env ./load-rdf.scm conn.scm ~/data/dump/dump.ttl ``` +This `load-rdf.scm` script replaces the existing graph with the TTL files from `/var/lib/data` and indexes all the text data for quicker searches. ## Upload data to virtuoso diff --git a/conn.scm b/conn.scm index aca2835..483fe9c 100644 --- a/conn.scm +++ b/conn.scm @@ -1,12 +1,11 @@ ((sql-username . "webqtlout") - (sql-password . "*") + (sql-password . "webqtlout") (sql-database . "db_webqtl") (sql-host . "localhost") (sql-port . 3306) - (virtuoso-port . 8891) + (virtuoso-port . 1111) (virtuoso-username . "dba") (virtuoso-password . "*") (sparql-scheme . http) (sparql-host . "localhost") - (sparql-port . 8892) - (generif-data-file . "/export3/local/home/bonfacem/dump-genenetwork-database/generifs_basic.gz")) + (sparql-port . 9082)) diff --git a/etc/sample.json b/etc/sample.json new file mode 100644 index 0000000..b32e24c --- /dev/null +++ b/etc/sample.json @@ -0,0 +1,8 @@ +{ + "metadata": { + "name": "some-metadata-item-name", + "displayName": "the-item-display-name", + "createdBy": "Frederick M. Muriithi", + "why": "to-demo-usage-of-system-and-for-documentation" + } +} diff --git a/examples/classification.scm b/examples/classification.scm index 3024af6..e3da8da 100755 --- a/examples/classification.scm +++ b/examples/classification.scm @@ -13,81 +13,65 @@ -(define (remap-species-identifiers str) - "This procedure remaps identifiers to standard binominal. Obviously this should - be sorted by correcting the database!" 
- (match str - ["Fly (Drosophila melanogaster dm6)" "Drosophila melanogaster"] - ["Oryzias latipes (Japanese medaka)" "Oryzias latipes"] - ["Macaca mulatta" "Macaca nemestrina"] - ["Bat (Glossophaga soricina)" "Glossophaga soricina"] - [str str])) - ;; Classification Scheme (define-transformer classification-scheme-species (tables (Species)) (schema-triples - (gnc:ResourceClassificationScheme a skos:ConceptScheme) - (gnc:ResourceClassificationScheme skos:prefLabel "GeneNetwork Classification Scheme For Resources") - (gnc:ResourceClassificationScheme xkos:numberOfLevels "3") - (gnc:ResourceClassificationScheme xkos:levels "( gnc:DatasetType gnc:Set gnc:Species )") - (gnc:DatasetType a xkos:ClassificationLevel) - (gnc:DatasetType skos:prefLabel "The Type of a Dataset which can be a ProbeSet, Genotype, or Phenotype") - (gnc:DatasetType xkos:depth "1") - (gnc:DatasetType skos:member gnc:Probeset) - (gnc:DatasetType skos:member gnc:Genotype) - (gnc:DatasetType skos:member gnc:Phenotype) - (gnc:Probeset skos:prefLabel "mRNA Assay Datasets") - (gnc:Probeset skos:altLabel "ProbeSet") - (gnc:Genotype skos:prefLabel "Genotype") - (gnc:Genotype skos:altLabel "DNA Markers and SNPs") - (gnc:Phenotype skos:prefLabel "Phenotype") - (gnc:Phenotype skos:altLabel "Traits and Cofactors") - (gnc:Species a xkos:ClassificationLevel) - (gnc:Species skos:prefLabel "The species in which this resource belongs") - (gnc:Species xkos:depth "3") - (gnc:Species xkos:specializes gnc:Set)) - (triples "gnc:Species" + (gnc:resource_classification_scheme a skos:ConceptScheme) + (gnc:resource_classification_scheme skos:prefLabel "GeneNetwork Classification Scheme For Resources which are either defines as a dataset, an inbred group, or a species.") + (gnc:resource_classification_scheme xkos:numberOfLevels "3") + (gnc:resource_classification_scheme xkos:levels "( gnc:dataset_type gnc:set gnc:species )") + (gnc:dataset_type a xkos:ClassificationLevel) + (gnc:dataset_type skos:prefLabel "The Type of a 
Dataset which can be a ProbeSet, Genotype, or Phenotype") + (gnc:dataset_type xkos:depth "1") + (gnc:dataset_type skos:member gnc:probeset) + (gnc:dataset_type skos:member gnc:genotype) + (gnc:dataset_type skos:member gnc:phenotype) + (gnc:probeset skos:prefLabel "mRNA Assay Datasets") + (gnc:probeset skos:altLabel "ProbeSet") + (gnc:genotype skos:prefLabel "Genotype") + (gnc:genotype skos:altLabel "DNA Markers and SNPs") + (gnc:phenotype skos:prefLabel "Phenotype") + (gnc:phenotype skos:altLabel "Traits and Cofactors") + (gnc:species a xkos:ClassificationLevel) + (gnc:species skos:prefLabel "The species in which this resource belongs") + (gnc:species xkos:depth "3") + (gnc:species xkos:specializes gnc:set)) + (triples "gnc:species" (set skos:member - (string->identifier "" (remap-species-identifiers (field Species Fullname)) - #:separator "" - #:proc string-capitalize-first)))) + (string->identifier "" (remap-species-identifiers (field Species Fullname)))))) (define-transformer classification-scheme-set (tables (InbredSet)) (schema-triples - (gnc:Set a xkos:ClassificationLevel) - (gnc:Set skos:prefLabel "The Type of Set, Ie InbredSet/OutbredSet that a resource can belong to") - (gnc:Set xkos:depth "2") - (gnc:Set xkos:generalizes gnc:Species)) - (triples "gnc:Set" + (gnc:set a xkos:ClassificationLevel) + (gnc:set skos:prefLabel "The Type of Set, Ie InbredSet/OutbredSet that a resource can belong to") + (gnc:set xkos:depth "2") + (gnc:set xkos:generalizes gnc:species)) + (triples "gnc:set" (set skos:member (string->identifier - "set" (field InbredSet Name InbredSetName) - #:separator "" - #:proc string-capitalize-first)))) + "set" (field InbredSet Name InbredSetName) #:separator "_")))) (define-transformer species (tables (Species)) (schema-triples (gnt:family a owl:ObjectProperty) - (gnt:family rdfs:domain gnc:Species) + (gnt:family rdfs:domain gnc:species) (gnt:family skos:definition "This resource belongs to this family") - (gnt:shortName a owl:ObjectProperty) - 
(gnt:shortName rdfs:domain gnc:Species) - (gnt:shortName skos:definition "The short name of a given resource") - (gnt:belongsToSpecies a rdf:property) - (gnt:belongsToSpecies rdf:comment "This resource given to this species") - (gnt:belongsToSpecies rdf:label "belongsToSpecies")) + (gnt:short_name a owl:ObjectProperty) + (gnt:short_name rdfs:domain gnc:species) + (gnt:short_name skos:definition "The short name of a given resource") + (gnt:belongs_to_species a rdf:property) + (gnt:belongs_to_species rdf:comment "This resource belongs to this species") + (gnt:belongs_to_species rdf:label "belongsToSpecies")) (triples - (string->identifier "" (remap-species-identifiers (field Species Fullname)) - #:separator "" - #:proc string-capitalize-first) - (set skos:inScheme 'gnc:ResourceClassificationScheme) + (string->identifier "" (remap-species-identifiers (field Species Fullname))) + (set skos:inScheme 'gnc:resource_classification_scheme) (set rdfs:label (remap-species-identifiers (field Species Fullname))) (set skos:prefLabel (field Species MenuName)) (set skos:altLabel (field Species SpeciesName)) - (set gnt:shortName (field Species Name)) + (set gnt:short_name (field Species Name)) (set gnt:family (field Species Family)) (set skos:notation (ontology 'taxon: @@ -99,32 +83,27 @@ (left-join MappingMethod "ON InbredSet.MappingMethodId=MappingMethod.Id"))) (schema-triples - (gnt:geneticType a owl:ObjectProperty) - (gnt:geneticType rdfs:domain gnc:set) + (gnt:genetic-type a owl:ObjectProperty) + (gnt:genetic-type rdfs:domain gnc:set) (gnt:code a owl:ObjectProperty) (gnt:code rdfs:domain gnc:set) ;; Already defined as an owl prop in species - (gnt:family rdfs:domain gnc:Set) - (gnt:mappingMethod a owl:ObjectProperty) - (gnt:mappingMethod rdfs:domain gnc:set) - (gnt:belongsToGroup a rdf:property) - (gnt:belongsToGroup rdf:comment "This resource given to this group") - (gnt:belongsToGroup rdf:label "belongsToGroup")) - (triples (string->identifier - "set" (field InbredSet Name 
InbredSetName) - #:separator "" - #:proc string-capitalize-first) - (set skos:inScheme 'gnc:ResourceClassificationScheme) + (gnt:family rdfs:domain gnc:set) + (gnt:mapping_method a owl:ObjectProperty) + (gnt:mapping_method rdfs:domain gnc:set) + (gnt:belongs_to_group a rdf:property) + (gnt:belongs_to_group rdf:comment "This resource given to this group") + (gnt:belongs_to_group rdf:label "belongs-to-group")) + (triples (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_") + (set skos:inScheme 'gnc:resource_classification_scheme) (set rdfs:label (field InbredSet FullName)) (set skos:prefLabel (field InbredSet Name InbredSetName)) - (set gnt:geneticType (field InbredSet GeneticType)) + (set gnt:genetic-type (field InbredSet GeneticType)) (set gnt:family (field InbredSet Family)) - (set gnt:mappingMethod (field MappingMethod Name)) + (set gnt:mapping_method (field MappingMethod Name)) (set gnt:code (field InbredSet InbredSetCode)) (set xkos:generalizes - (string->identifier "" (remap-species-identifiers (field Species Fullname)) - #:separator "" - #:proc string-capitalize-first)))) + (string->identifier "" (remap-species-identifiers (field Species Fullname)))))) @@ -145,10 +124,10 @@ (connection %connection-settings) (table-metadata? 
#f) (prefixes - '(("gn:" "<http://genenetwork.org/id/>") - ("gnc:" "<http://genenetwork.org/category/>") + '(("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") ("owl:" "<http://www.w3.org/2002/07/owl#>") - ("gnt:" "<http://genenetwork.org/term/>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") ("skos:" "<http://www.w3.org/2004/02/skos/core#>") ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>") ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") diff --git a/examples/dataset-metadata.scm b/examples/dataset-metadata.scm index 5c59530..fc36a8f 100755 --- a/examples/dataset-metadata.scm +++ b/examples/dataset-metadata.scm @@ -12,15 +12,6 @@ (transform special-forms)) -(define (remap-species-identifiers str) - "This procedure remaps identifiers to standard binominal. Obviously this should - be sorted by correcting the database!" - (match str - ["Fly (Drosophila melanogaster dm6)" "Drosophila melanogaster"] - ["Oryzias latipes (Japanese medaka)" "Oryzias latipes"] - ["Macaca mulatta" "Macaca nemestrina"] - ["Bat (Glossophaga soricina)" "Glossophaga soricina"] - [str str])) ;; One email ID in the Investigators table has spaces in it. This ;; function fixes that. @@ -36,7 +27,8 @@ (string->identifier "investigator" (string-join (list first-name last-name (fix-email-id email)) - "_"))) + "_") + #:separator "_")) (define-transformer investigators ;; There are a few duplicate entries. 
We group by email to @@ -45,7 +37,7 @@ "GROUP BY Email") (triples (investigator-attributes->id (field Investigators FirstName) (field Investigators LastName) - (field Investigators Email)) + "") (set rdf:type 'foaf:Person) (set foaf:name (string-append (field Investigators FirstName) " " (field Investigators LastName))) @@ -64,27 +56,25 @@ (tables (GeneChip (left-join Species "USING (SpeciesId)"))) (schema-triples - (gnc:geneChip a skos:Concept) - (gnc:geneChip + (gnc:gene_chip a skos:Concept) + (gnc:gene_chip skos:description "This is a set of controlled terms that are used to describe a given gene chip/platform") - (gnt:hasGeoSeriesId rdfs:domain gnc:platform) - (gnt:hasGeoSeriesId rdfs:domain gnc:geneChip) - (gnt:hasGOTreeValue a owl:ObjectProperty) - (gnt:hasGOTreeValue skos:definition "This resource the following GO tree value") - (gnt:hasGOTreeValue rdfs:domain gnc:geneChip)) - (triples (string->identifier "platform" (field GeneChip Name)) - (set rdf:type 'gnc:geneChip) + (gnt:has_geo_series_id rdfs:domain gnc:platform) + (gnt:has_geo_series_id rdfs:domain gnc:gene_chip) + (gnt:has_go_tree_value a owl:ObjectProperty) + (gnt:has_go_tree_value skos:definition "This resource the following GO tree value") + (gnt:has_go_tree_value rdfs:domain gnc:gene_chip)) + (triples (string->identifier "platform" (field GeneChip Name) #:separator "_") + (set rdf:type 'gnc:gene_chip) (set rdfs:label (field GeneChip GeneChipName)) (set skos:prefLabel (field GeneChip Name)) (set skos:altLabel (field ("IF(GeneChip.GeneChipName != GeneChip.Title, Title, NULL)" Title))) - (set gnt:hasGOTreeValue (field GeneChip Go_tree_value)) + (set gnt:has_go_tree_value (field GeneChip Go_tree_value)) (set xkos:classifiedUnder - (string->identifier "" (remap-species-identifiers (field Species Fullname)) - #:separator "" - #:proc string-capitalize-first)) - (set gnt:hasGeoSeriesId + (string->identifier "" (remap-species-identifiers (field Species Fullname)) #:separator "")) + (set 
gnt:has_geo_series_id (ontology 'geoSeries: (string-trim-both (field GeneChip GeoPlatform)))))) @@ -107,109 +97,284 @@ ;; if they exist in the (Publish/Geno)Freeze tables. "LEFT JOIN InbredSet PublishInbredSet ON PublishFreeze.InbredSetId = PublishInbredSet.InbredSetId LEFT JOIN InbredSet GenoInbredSet ON GenoFreeze.InbredSetId = GenoInbredSet.InbredSetId WHERE GN_AccesionId IS NOT NULL") (schema-triples - (gnt:hasTissue rdfs:domain dcat:Dataset) - (gnt:hasTissue a owl:ObjectProperty) - (gnt:hasTissue skos:definition "Tissues this resource has") - (gnt:usesNormalization rdfs:domain dcat:Dataset) - (gnt:usesNormalization a owl:ObjectProperty) - (gnt:usesNormalization skos:definition "Normalization techniques this resource has") - (gnt:usesPlatform rdfs:domain dcat:Dataset) - (gnt:usesPlatform a owl:ObjectProperty) - (gnt:usesPlatform skos:definition "The Platform this resource uses") - (gnt:hasGeoSeriesId rdfs:domain dcat:Dataset) - (gnt:hasGeoSeriesId a owl:ObjectProperty) - (gnt:hasGeoSeriesId skos:definition "id of record in NCBI database") - (gnt:hasExperimentType rdfs:domain dcat:Dataset) - (gnt:hasExperimentType a owl:ObjectProperty) - (gnt:hasExperimentType rdfs:label "Experiment Type Metadata") - (gnt:hasExperimentType skos:definition "Information about the experiment type") - (gnt:hasTissueInfo rdfs:domain dcat:Dataset) - (gnt:hasTissueInfo a owl:ObjectProperty) - (gnt:hasTissueInfo skos:definition "Metadata about Tissue for this resource") - (gnt:hasExperimentDesignInfo rdfs:domain dcat:Dataset) - (gnt:hasExperimentDesignInfo rdfs:label "Experiment Design") - (gnt:hasExperimentDesignInfo a owl:ObjectProperty) - (gnt:hasExperimentDesignInfo skos:definition "Information about how the experiment was designed") - (gnt:hasNotes rdfs:domain dcat:Dataset) - (gnt:hasNotes a owl:ObjectProperty) - (gnt:hasNotes rdfs:label "Notes") - (gnt:hasNotes skos:definition "Extra Notes about this dataset") - (gnt:hasDataProcessingInfo rdfs:domain dcat:Dataset) - 
(gnt:hasDataProcessingInfo rdfs:label "About Data Processing") - (gnt:hasDataProcessingInfo a owl:ObjectProperty) - (gnt:hasDataProcessingInfo skos:definition "Information about how this dataset was processed") - (gnt:hasPlatformInfo rdfs:domain dcat:Dataset) - (gnt:hasPlatformInfo a owl:ObjectProperty) - (gnt:hasPlatformInfo rdfs:label "About Platform") - (gnt:hasPlatformInfo skos:definition "Information about the platform that was used with this dataset") - (gnt:hasCaseInfo rdfs:domain dcat:Dataset) - (gnt:hasCaseInfo rdfs:label "About Case") - (gnt:hasCaseInfo a owl:ObjectProperty) - (gnt:hasCaseInfo skos:definition "Information about the cases used in this platform") - (gnt:hasExperimentType skos:definition "Information about the experiment type") - (gnt:hasAcknowledgement rdfs:domain dcat:Dataset) - (gnt:hasAcknowledgement rdfs:label "Acknowledgement") - (gnt:hasAcknowledgement a owl:ObjectProperty) - (gnt:hasAcknowledgement skos:definition "People to acknowledge")) - (triples (string->identifier - "" (regexp-substitute/global #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)) + (gnt:has_tissue rdfs:domain dcat:Dataset) + (gnt:has_tissue a owl:ObjectProperty) + (gnt:has_tissue skos:definition "Tissues this resource has") + (gnt:uses_normalization rdfs:domain dcat:Dataset) + (gnt:uses_normalization a owl:ObjectProperty) + (gnt:uses_normalization skos:definition "Normalization techniques this resource has") + (gnt:uses_platform rdfs:domain dcat:Dataset) + (gnt:uses_platform a owl:ObjectProperty) + (gnt:uses_platform skos:definition "The Platform this resource uses") + (gnt:has_geo_series_id rdfs:domain dcat:Dataset) + (gnt:has_geo_series_id a owl:ObjectProperty) + (gnt:has_geo_series_id skos:definition "id of record in NCBI database") + (gnt:has_experiment_type rdfs:domain dcat:Dataset) + (gnt:has_experiment_type a owl:ObjectProperty) + (gnt:has_experiment_type rdfs:label "Experiment Type Metadata") + (gnt:has_experiment_type skos:definition 
"Information about the experiment type") + (gnt:has_tissue_info rdfs:domain dcat:Dataset) + (gnt:has_tissue_info a owl:ObjectProperty) + (gnt:has_tissue_info skos:definition "Metadata about Tissue for this resource") + (gnt:has_experiment_design_info rdfs:domain dcat:Dataset) + (gnt:has_experiment_design_info rdfs:label "Experiment Design") + (gnt:has_experiment_design_info a owl:ObjectProperty) + (gnt:has_experiment_design_info skos:definition "Information about how the experiment was designed") + (gnt:has_notes rdfs:domain dcat:Dataset) + (gnt:has_notes a owl:ObjectProperty) + (gnt:has_notes rdfs:label "Notes") + (gnt:has_notes skos:definition "Extra Notes about this dataset") + (gnt:has_data_processing_info rdfs:domain dcat:Dataset) + (gnt:has_data_processing_info rdfs:label "About Data Processing") + (gnt:has_data_processing_info a owl:ObjectProperty) + (gnt:has_data_processing_info skos:definition "Information about how this dataset was processed") + (gnt:has_platform_info rdfs:domain dcat:Dataset) + (gnt:has_platform_info a owl:ObjectProperty) + (gnt:has_platform_info rdfs:label "About Platform") + (gnt:has_platform_info skos:definition "Information about the platform that was used with this dataset") + (gnt:has_case_info rdfs:domain dcat:Dataset) + (gnt:has_case_info rdfs:label "About Case") + (gnt:has_case_info a owl:ObjectProperty) + (gnt:has_case_info skos:definition "Information about the cases used in this platform") + (gnt:has_summary rdfs:domain dcat:Dataset) + (gnt:has_summary rdfs:label "Summary") + (gnt:has_summary a owl:ObjectProperty) + (gnt:has_summary skos:definition "Summary information about dataset") + (gnt:has_citation rdfs:domain dcat:Dataset) + (gnt:has_citation rdfs:label "Citation") + (gnt:has_citation a owl:ObjectProperty) + (gnt:has_citation skos:definition "Citation for this dataset") + (gnt:has_contributors rdfs:domain dcat:Dataset) + (gnt:has_contributors rdfs:label "Contributors") + (gnt:has_contributors a owl:ObjectProperty) + 
(gnt:has_contributors skos:definition "Contributors of this resource") + (gnt:has_experiment_design rdfs:domain dcat:Dataset) + (gnt:has_experiment_design rdfs:label "Experiment Design") + (gnt:has_experiment_design a owl:ObjectProperty) + (gnt:has_experiment_design skos:definition "Experiment Design for this resource") + (gnt:has_tissue_info rdfs:domain dcat:Dataset) + (gnt:has_tissue_info rdfs:label "Tissue Information") + (gnt:has_tissue_info a owl:ObjectProperty) + (gnt:has_tissue_info skos:definition "Tissue information about dataset") + (gnt:has_experiment_type skos:definition "Information about the experiment type") + (gnt:has_acknowledgement rdfs:domain dcat:Dataset) + (gnt:has_acknowledgement rdfs:label "Acknowledgement") + (gnt:has_acknowledgement a owl:ObjectProperty) + (gnt:has_acknowledgement skos:definition "People to acknowledge")) + (triples + (string->identifier + "" (let ((info-page-name (field InfoFiles InfoPageName)) + (info-title (field InfoFiles Title))) + (format #f "~a" + (if (and (string? info-page-name) + (string=? (string-downcase (string-trim-both info-page-name)) + "none")) + info-title info-page-name)))) (set rdf:type 'dcat:Dataset) (set xkos:classifiedUnder (let ([dataset-type (string-trim-both - (field ("IF(GenoFreeze.Id IS NOT NULL, 'gnc:Genotype', IF(PublishFreeze.Id IS NOT NULL, 'gnc:Phenotype', IF(ProbeSetFreeze.Name IS NOT NULL, 'gnc:Probeset', '')))" + (field ("IF(GenoFreeze.Id IS NOT NULL, 'gnc:genotype', IF(PublishFreeze.Id IS NOT NULL, 'gnc:phenotype', IF(ProbeSetFreeze.Name IS NOT NULL, 'gnc:probeset', '')))" DatasetType)))]) (if (not (string-null? 
dataset-type)) (string->symbol dataset-type) ""))) - (set rdfs:label (regexp-substitute/global - #f "^[Nn]one$" - (field InfoFiles InfoPageName) - "")) + (set rdfs:label (normalize-string-field (field InfoFiles InfoPageName))) (set skos:prefLabel - (field ("IFNULL(GenoFreeze.FullName, IFNULL(PublishFreeze.FullName, ''))" - DatasetFullName))) + (normalize-string-field + (field ("IFNULL(GenoFreeze.FullName, IFNULL(PublishFreeze.FullName, ''))" + DatasetFullName)))) (set skos:altLabel (field Datasets DatasetName DatasetGroup)) - (set dct:title - (regexp-substitute/global - #f "^[Nn]one$" - (or - (regexp-substitute/global - #f "^Unpublished$" (field Datasets PublicationTitle) "") - (field InfoFiles InfoFileTitle) - "") - "")) + (set dct:title (normalize-string-field (field Datasets PublicationTitle))) (set dct:created - (field ("IFNULL(GenoFreeze.CreateTime, IFNULL(PublishFreeze.CreateTime, IFNULL(ProbeSetFreeze.CreateTime, '')))" - createTimeGenoFreeze))) + (normalize-string-field + (field ("IFNULL(GenoFreeze.CreateTime, IFNULL(PublishFreeze.CreateTime, IFNULL(ProbeSetFreeze.CreateTime, '')))" + createTimeGenoFreeze)))) (set dcat:contactPoint (investigator-attributes->id (field Investigators FirstName) (field Investigators LastName) - (field Investigators Email))) + "")) (set foaf:Organization (field Organizations OrganizationName)) (set dct:identifier (format #f "GN~a" (field InfoFiles GN_AccesionId))) (set dct:accessRights (string-downcase (field DatasetStatus DatasetStatusName))) - (set gnt:belongsToGroup + (set gnt:belongs_to_group (string->identifier "set" (field ("IFNULL(InbredSet.Name, IFNULL(PublishInbredSet.Name, GenoInbredSet.Name))" - InbredSetName)))) - (set gnt:hasTissue (string->identifier "tissue" - (field Tissue Short_Name))) - (set gnt:usesNormalization - (string->identifier "avgMethod" - ;; If AvgMethodName is NULL, assume N/A. - (if (string-blank? 
(field AvgMethod Name AvgMethodName)) - "N/A" (field AvgMethod Name AvgMethodName)))) - (set gnt:usesPlatform + InbredSetName)) + #:separator "_")) + (set gnt:has_tissue (string->identifier "tissue" + (field Tissue Short_Name) + #:separator "_")) + (set gnt:uses_normalization + (let ((avg-method (normalize-string-field (field AvgMethod Name AvgMethodName)))) + (if (not (string-blank? avg-method)) + (string->identifier "avg_method" avg-method #:separator "_") + ""))) + (set gnt:has_summary + (let* ((summary-link + (format + #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/summary.rtf>" + (string-capitalize-first + (regexp-substitute/global + #f "[^A-Za-z0-9:]" + (field InfoFiles InfoPageName) + 'pre "_" 'post)))) + (summary + (field InfoFiles Summary))) + (if (or (null? summary) (string-blank? summary)) + "" (string->symbol summary-link)))) + (set gnt:has_tissue_info + (let* ((tissue-info-link + (format + #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/tissue.rtf>" + (string-capitalize-first + (regexp-substitute/global + #f "[^A-Za-z0-9:]" + (field InfoFiles InfoPageName) + 'pre "_" 'post)))) + (tissue-info + (field Datasets AboutTissue))) + (if (or (null? tissue-info) (string-blank? tissue-info)) + "" (string->symbol tissue-info-link)))) + (set gnt:has_citation + (let* ((citation-link + (format + #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/citation.rtf>" + (string-capitalize-first + (regexp-substitute/global + #f "[^A-Za-z0-9:]" + (field InfoFiles InfoPageName) + 'pre "_" 'post)))) + (citation + (field Datasets Citation))) + (if (or (null? citation) (string-blank? 
citation)) + "" (string->symbol citation-link)))) + (set gnt:hasSpecifics + (let* ((specifics-link + (format + #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/specifics.rtf>" + (string-capitalize-first + (regexp-substitute/global + #f "[^A-Za-z0-9:]" + (field InfoFiles InfoPageName) + 'pre "_" 'post)))) + (specifics + (field InfoFiles Specifics))) + (if (or (null? specifics) (string-blank? specifics)) + "" (string->symbol specifics-link)))) + (set gnt:has_case_info + (let* ((cases-link + (format + #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/cases.rtf>" + (string-capitalize-first + (regexp-substitute/global + #f "[^A-Za-z0-9:]" + (field InfoFiles InfoPageName) + 'pre "_" 'post)))) + (cases + (field Datasets AboutCases))) + (if (or (null? cases) (string-blank? cases)) + "" (string->symbol cases-link)))) + (set gnt:has_platform_info + (let* ((platform-link + (format + #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/platform.rtf>" + (string-capitalize-first + (regexp-substitute/global + #f "[^A-Za-z0-9:]" + (field InfoFiles InfoPageName) + 'pre "_" 'post)))) + (platform + (field Datasets AboutPlatform))) + (if (or (null? platform) (string-blank? platform)) + "" (string->symbol platform-link)))) + (set gnt:has_data_processing_info + (let* ((processing-link + (format + #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/processing.rtf>" + (string-capitalize-first + (regexp-substitute/global + #f "[^A-Za-z0-9:]" + (field InfoFiles InfoPageName) + 'pre "_" 'post)))) + (processing + (field Datasets AboutDataProcessing))) + (if (or (null? processing) (string-blank? 
processing)) + "" (string->symbol processing-link)))) + (set gnt:has_notes + (let* ((notes-link + (format + #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/notes.rtf>" + (string-capitalize-first + (regexp-substitute/global + #f "[^A-Za-z0-9:]" + (field InfoFiles InfoPageName) + 'pre "_" 'post)))) + (notes + (field Datasets Notes))) + (if (or (null? notes) (string-blank? notes)) + "" (string->symbol notes-link)))) + (set gnt:has_experiment_type + (let* ((experiment-type-link + (format + #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/experiment-type.rtf>" + (string-capitalize-first + (regexp-substitute/global + #f "[^A-Za-z0-9:]" + (field InfoFiles InfoPageName) + 'pre "_" 'post)))) + (experiment-type + (field InfoFiles Experiment_Type))) + (if (or (null? experiment-type) (string-blank? experiment-type)) + "" (string->symbol experiment-type-link)))) + (set gnt:has_experiment_design + (let* ((experiment-design-link + (format + #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/experiment-design.rtf>" + (string-capitalize-first + (regexp-substitute/global + #f "[^A-Za-z0-9:]" + (field InfoFiles InfoPageName) + 'pre "_" 'post)))) + (experiment-design + (field Datasets ExperimentDesign))) + (if (or (null? experiment-design) (string-blank? experiment-design)) + "" (string->symbol experiment-design-link)))) + (set gnt:has_contributors + (let* ((contributors-link + (format + #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/contributors.rtf>" + (string-capitalize-first + (regexp-substitute/global + #f "[^A-Za-z0-9:]" + (field InfoFiles InfoPageName) + 'pre "_" 'post)))) + (contributors + (field Datasets Contributors))) + (if (or (null? contributors) (string-blank? 
contributors)) + "" (string->symbol contributors-link)))) + (set gnt:has_acknowledgement + (let* ((acknowledgment-link + (format + #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/acknowledgment.rtf>" + (string-capitalize-first + (regexp-substitute/global + #f "[^A-Za-z0-9:]" + (field InfoFiles InfoPageName) + 'pre "_" 'post)))) + (acknowledgment + (field Datasets Acknowledgment))) + (if (or (null? acknowledgment) (string-blank? acknowledgment)) + "" (string->symbol acknowledgment-link)))) + (set gnt:uses_platform (string->identifier "platform" - (field GeneChip Name GeneChip))) - (set gnt:hasGeoSeriesId + (field GeneChip Name GeneChip) + #:separator "_")) + (set gnt:has_geo_series_id (let ((s (string-match "GSE[0-9]*" (field ("IFNULL(Datasets.GeoSeries, '')" GeoSeries))))) @@ -224,24 +389,19 @@ (left-join InbredSet "ON PublishFreeze.InbredSetId = InbredSet.InbredSetId")) "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoFileId IS NULL") (triples - (string->identifier - "" - (regexp-substitute/global #f "[^A-Za-z0-9:]" - (field PublishFreeze Name) - 'pre "_" 'post)) + (string->identifier "" (field PublishFreeze Name)) (set rdf:type 'dcat:Dataset) - (set xkos:classifiedUnder 'gnc:Phenotype) + (set xkos:classifiedUnder 'gnc:phenotype) (set dct:title (field PublishFreeze FullName)) (set rdfs:label (field PublishFreeze Name)) (set skos:altLabel (field PublishFreeze ShortName)) (set dct:created (annotate-field (field PublishFreeze CreateTime) '^^xsd:date)) - (set gnt:belongsToGroup + (set gnt:belongs_to_group (string->identifier "set" (field InbredSet Name InbredSetName) - #:separator "" - #:proc string-capitalize-first)))) + #:separator "_")))) (define-transformer genofreeze (tables (GenoFreeze @@ -249,28 +409,20 @@ (left-join InbredSet "ON GenoFreeze.InbredSetId = InbredSet.InbredSetId")) "WHERE GenoFreeze.public > 0 AND GenoFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL") (triples - 
(string->identifier - "" - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field GenoFreeze Name) - 'pre "_" 'post) - 'pre "_" 'post)) + (string->identifier "" (field GenoFreeze Name)) (set rdf:type 'dcat:Dataset) - (set xkos:classifiedUnder 'gnc:Genotype) + (set xkos:classifiedUnder 'gnc:genotype) (set rdfs:label (field GenoFreeze Name)) (set dct:title (field GenoFreeze FullName)) (set skos:altLabel (field GenoFreeze ShortName)) (set dct:created (annotate-field (field GenoFreeze CreateTime) '^^xsd:date)) - (set gnt:belongsToGroup + (set gnt:belongs_to_group (string->identifier "set" (field InbredSet Name InbredSetName) - #:separator "" - #:proc string-capitalize-first)))) + #:separator "_" + #:proc (lambda (x) x))))) ;; Molecular Traits are also referred to as ProbeSets (define-transformer probesetfreeze @@ -282,24 +434,19 @@ (left-join Tissue "ON ProbeFreeze.TissueId = Tissue.TissueId")) "WHERE ProbeSetFreeze.public > 0 AND InfoFiles.InfoPageName IS NULL GROUP BY ProbeFreeze.Id") (schema-triples - (gnt:usesNormalization rdfs:domain gnc:probeset) - (gnt:usesDataScale rdfs:domain gnc:probeset) - (gnt:usesDataScale a owl:ObjectProperty) - (gnt:usesDataScale skos:definition "Thi data scale this resource uses")) + (gnt:uses_normalization rdfs:domain gnc:probeset) + (gnt:uses_data_scale rdfs:domain gnc:probeset) + (gnt:uses_data_scale a owl:ObjectProperty) + (gnt:uses_data_scale skos:definition "Thi data scale this resource uses")) (triples - (string->identifier - "" - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field ProbeSetFreeze Name) - 'pre "_" 'post)) + (string->identifier "" (field ProbeSetFreeze Name)) (set rdf:type 'dcat:Dataset) - (set xkos:classifiedUnder 'gnc:Probeset) - (set gnt:usesNormalization - (string->identifier "avgMethod" - ;; If AvgMethodName is NULL, assume N/A. - (if (string-blank? 
(field AvgMethod Name AvgMethodName)) - "N/A" (field AvgMethod Name AvgMethodName)))) + (set xkos:classifiedUnder 'gnc:probeset) + (set gnt:uses_normalization + (let ((avg-method (field AvgMethod Name AvgMethodName))) + (if (string-blank? avg-method) + #f + avg-method))) (set dct:title (field ProbeSetFreeze FullName)) (set rdfs:label (field ProbeSetFreeze ShortName)) (set skos:prefLabel (field ProbeSetFreeze Name)) @@ -307,16 +454,9 @@ (set dct:created (annotate-field (field ProbeSetFreeze CreateTime) '^^xsd:datetime)) - (set gnt:usesDataScale (field ProbeSetFreeze DataScale)) - (set gnt:hasTissue - (string->identifier - "tissue" - (field Tissue Short_Name))) - (set gnt:belongsToGroup - (string->identifier - "set" (field InbredSet Name InbredSetName) - #:separator "" - #:proc string-capitalize-first)))) + (set gnt:uses_data_scale (field ProbeSetFreeze DataScale)) + (set gnt:has_tissue (string->identifier "tissue" (field Tissue Short_Name) #:separator "_")) + (set gnt:belongs_to_group (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_")))) @@ -343,9 +483,9 @@ ("skos:" "<http://www.w3.org/2004/02/skos/core#>") ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>") ("geoSeries:" "<http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=>") - ("gnt:" "<http://genenetwork.org/term/>") - ("gn:" "<http://genenetwork.org/id/>") - ("gnc:" "<http://genenetwork.org/category/>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") + ("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") ("owl:" "<http://www.w3.org/2002/07/owl#>") ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") diff --git a/examples/genbank.scm b/examples/genbank.scm index 391cff0..149ac0f 100755 --- a/examples/genbank.scm +++ b/examples/genbank.scm @@ -10,35 +10,22 @@ (transform strings) (transform sql) (transform triples) - (transform special-forms) - (transform uuid)) + (transform 
special-forms)) -(define (remap-species-identifiers str) - "This procedure remaps identifiers to standard binominal. Obviously this should - be sorted by correcting the database!" - (match str - ["Fly (Drosophila melanogaster dm6)" "Drosophila melanogaster"] - ["Oryzias latipes (Japanese medaka)" "Oryzias latipes"] - ["Macaca mulatta" "Macaca nemestrina"] - ["Bat (Glossophaga soricina)" "Glossophaga soricina"] - [str str])) - (define-transformer genbank (tables (Genbank (left-join Species "USING (SpeciesId)"))) (schema-triples (gnc:nucleotide a skos:Concept) - (gnt:hasSequence rdfs:domain gnc:nucleotide)) + (gnt:has_sequence rdfs:domain gnc:nucleotide)) (triples (ontology 'genbank: (field Genbank Id)) - (set gnt:hasSequence (field Genbank Sequence)) - (set gnt:belongsToSpecies - (string->identifier "" (remap-species-identifiers (field Species Fullname)) - #:separator "" - #:proc string-capitalize-first)))) + (set gnt:has_sequence (field Genbank Sequence)) + (set gnt:belongs_to_species + (string->identifier "" (remap-species-identifiers (field Species Fullname)))))) @@ -63,9 +50,9 @@ ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") ("skos:" "<http://www.w3.org/2004/02/skos/core#>") ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>") - ("gn:" "<http://genenetwork.org/id/>") - ("gnc:" "<http://genenetwork.org/category/>") - ("gnt:" "<http://genenetwork.org/term/>") + ("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") ("dct:" "<http://purl.org/dc/terms/>") ("foaf:" "<http://xmlns.com/foaf/0.1/>") ("pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>") diff --git a/examples/genelist.scm b/examples/genelist.scm index 9c1ced0..60ae4cd 100755 --- a/examples/genelist.scm +++ b/examples/genelist.scm @@ -18,73 +18,72 @@ (tables (GeneList (left-join Species "USING (SpeciesId)"))) (schema-triples - (gnt:gene rdfs:domain gnc:GeneSymbol) - (gnt:belongsToSpecies rdfs:domain 
gnc:GeneSymbol) - (gnc:Gene a rdfs:Class) - (gnc:Gene rdfs:label "Gene") - (gnt:hasGeneId a owl:ObjectProperty) - (gnt:hasGeneId rdfs:domain gnc:NCBIWikiEntry) - (gnt:hasGeneId skos:definition "The GeneId of this this resource") - (gnc:transcript rdfs:domain gnc:GeneSymbol) + (gnc:gene_symbol a rdfs:Class) + (gnc:gene_symbol rdfs:label "A gene symbol") + (gnt:gene rdfs:domain gnc:gene_symbol) + (gnt:belongs_to_species rdfs:domain gnc:gene_symbol) + (gnc:gene a rdfs:Class) + (gnc:gene rdfs:label "Gene") + (gnt:has_gene_id a owl:ObjectProperty) + (gnt:has_gene_id rdfs:domain gnc:ncbi_wiki_entry) + (gnt:has_gene_id skos:definition "The GeneId of this this resource") + (gnc:transcript rdfs:domain gnc:gene_symbol) (gnt:transcript a owl:ObjectProperty) (gnc:transcript rdfs:comments "The gene transcript of this resource") - (gnc:ebiGwasLink rdfs:Class gnc:ResourceLink) - (gnc:ebiGwasLink rdfs:label "EBI GWAS") - (gnc:ebiGwasLink rdfs:comments "EBI GWAS") - (gnc:proteinAtlasLink rdfs:Class gnc:ResourceLink) - (gnc:proteinAtlasLink rdfs:label "Protein Atlas") - (gnc:proteinAtlasLink rdfs:comments "Human Protein Atlas") - (gnc:genemaniaLink rdfs:Class gnc:ResourceLink) - (gnc:genemaniaLink rdfs:label "GeneMANIA") - (gnc:genemaniaLink rdfs:comments "GeneMANIA") - (gnc:gemmaLink rdfs:Class gnc:ResourceLink) - (gnc:gemmaLink rdfs:label "Gemma") - (gnc:gemmaLink rdfs:comments "Meta-analysis of gene expression data") - (gnc:biogpsLink rdfs:Class gnc:ResourceLink) - (gnc:biogpsLink rdfs:label "BioGPS") - (gnc:biogpsLink rdfs:comments "Expression across many tissues and cell types") - (gnc:abaLink rdfs:Class gnc:ResourceLink) - (gnc:abaLink rdfs:label "ABA") - (gnc:abaLink rdfs:comments "Allen Brain Atlas") - (gnc:pantherLink rdfs:Class gnc:ResourceLink) - (gnc:pantherLink rdfs:label "PANTHER") - (gnc:pantherLink rdfs:comments "Gene and protein data resources from Celera-ABI") - (gnc:stringLink rdfs:Class gnc:ResourceLink) - (gnc:stringLink rdfs:label "STRING") - (gnc:stringLink 
rdfs:comments "Protein interactions: known and inferred") - (gnc:gtexLink rdfs:Class gnc:ResourceLink) - (gnc:gtexLink rdfs:label "GTEx Portal") - (gnc:gtexLink rdfs:comments "GTEx Portal") - (gnc:rgdLink rdfs:Class gnc:ResourceLink) - (gnc:rgdLink rdfs:label "Rat Genome DB") - (gnc:rgdLink rdfs:comments "Rat Genome DB") - (gnc:hasKgID rdfs:domain gnc:GeneSymbol) - (gnt:hasKgID a owl:ObjectProperty) - (gnc:hasKgID rdfs:comments "The kgID of this resource") - (gnc:hasUnigenID rdfs:domain gnc:GeneSymbol) - (gnt:hasUnigenID a owl:ObjectProperty) - (gnc:hasUnigenID rdfs:comments "The UnigenID of this resource") - (gnc:hasProteinID rdfs:domain gnc:GeneSymbol) - (gnt:hasProteinID a owl:ObjectProperty) - (gnc:hasProteinID rdfs:comments "The ProteinID of this resource") - (gnc:hasAlignID rdfs:domain gnc:GeneSymbol) - (gnt:hasAlignID a owl:ObjectProperty) - (gnc:hasAlignID rdfs:comments "The AlignID of this resource") - (gnt:TxEnd rdfs:range xsd:double) - (gnt:TxStart rdfs:range xsd:double) - (gnt:hasTargetSeq rdfs:domain gnc:Probeset)) + (gnc:ebi_gwas_link rdfs:Class gnc:ResourceLink) + (gnc:ebi_gwas_link rdfs:label "EBI GWAS") + (gnc:ebi_gwas_link rdfs:comments "EBI GWAS") + (gnc:protein_atlas_link rdfs:Class gnc:ResourceLink) + (gnc:protein_atlas_link rdfs:label "Protein Atlas") + (gnc:protein_atlas_link rdfs:comments "Human Protein Atlas") + (gnc:genemania_link rdfs:Class gnc:ResourceLink) + (gnc:genemania_link rdfs:label "GeneMANIA") + (gnc:genemania_link rdfs:comments "GeneMANIA") + (gnc:gemma_link rdfs:Class gnc:ResourceLink) + (gnc:gemma_link rdfs:label "Gemma") + (gnc:gemma_link rdfs:comments "Meta-analysis of gene expression data") + (gnc:biogps_link rdfs:Class gnc:ResourceLink) + (gnc:biogps_link rdfs:label "BioGPS") + (gnc:biogps_link rdfs:comments "Expression across many tissues and cell types") + (gnc:aba_link rdfs:Class gnc:ResourceLink) + (gnc:aba_link rdfs:label "ABA") + (gnc:aba_link rdfs:comments "Allen Brain Atlas") + (gnc:panther_link rdfs:Class 
gnc:ResourceLink) + (gnc:panther_link rdfs:label "PANTHER") + (gnc:panther_link rdfs:comments "Gene and protein data resources from Celera-ABI") + (gnc:panther_link rdfs:Class gnc:ResourceLink) + (gnc:panther_link rdfs:label "STRING") + (gnc:panther_link rdfs:comments "Protein interactions: known and inferred") + (gnc:gtex_link rdfs:Class gnc:ResourceLink) + (gnc:gtex_link rdfs:label "GTEx Portal") + (gnc:gtex_link rdfs:comments "GTEx Portal") + (gnc:rgd_link rdfs:Class gnc:ResourceLink) + (gnc:rgd_link rdfs:label "Rat Genome DB") + (gnc:rgd_link rdfs:comments "Rat Genome DB") + (gnc:has_kg_id rdfs:domain gnc:gene_symbol) + (gnc:has_kg_id a owl:ObjectProperty) + (gnc:has_kg_id rdfs:comments "The kgID of this resource") + (gnc:has_unigen_id rdfs:domain gnc:gene_symbol) + (gnc:has_unigen_id a owl:ObjectProperty) + (gnc:has_unigen_id rdfs:comments "The UnigenID of this resource") + (gnc:has_protein_id rdfs:domain gnc:gene_symbol) + (gnt:has_protein_id a owl:ObjectProperty) + (gnc:has_protein_id rdfs:comments "The ProteinID of this resource") + (gnc:has_align_id rdfs:domain gnc:gene_symbol) + (gnt:has_align_id a owl:ObjectProperty) + (gnc:has_align_id rdfs:comments "The AlignID of this resource") + (gnt:tx_end rdfs:range xsd:double) + (gnt:tx_start rdfs:range xsd:double) + (gnt:has_target_seq rdfs:domain gnc:probeset)) (triples (string->identifier - "gene" (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (string-trim-both - (field ("CONCAT_WS('_', GeneSymbol, GeneID, AlignID)" GENE_UID))) - 'pre "_" 'post) - #:proc (lambda (x) x)) - (set rdf:type 'gnc:Gene) - (set gnt:geneSymbol (field GeneList GeneSymbol)) + "gene" (normalize-string-field (string-trim-both + (field ("CONCAT_WS('_', GeneSymbol, GeneID, AlignID)" GENE_UID)))) + #:separator "_") + (set rdf:type 'gnc:gene) + (set gnt:gene_symbol (field GeneList GeneSymbol)) (set dct:description (sanitize-rdf-string (field GeneList GeneDescription))) - (set gnt:hasGeneId (ontology 'gene: (field GeneList GeneId))) + (set 
gnt:has_gene_id (ontology 'gene: (field GeneList GeneId))) (set dct:references (let ((symbol (field GeneList GeneSymbol))) (if (not (string-blank? symbol)) @@ -94,7 +93,7 @@ "https://www.ebi.ac.uk/gwas/search?query=" (uri-encode (string-trim-both symbol)) - "a gnc:ebiGwasLink")) + "a gnc:ebi_gwas_link")) ""))) (set dct:references (let ((symbol (field GeneList GeneSymbol)) @@ -107,7 +106,7 @@ (string->symbol (format #f "<~0@*~a> .~%<~0@*~a> ~1@*~a" "http://mouse.brain-map.org/search/show?search_type=gene&search_term=" - "a gnc:abaLink" + "a gnc:aba_link" (if (string=? species "mouse") (uri-encode (string-trim-both symbol)) @@ -129,7 +128,7 @@ (string-trim-both symbol)) "&category=Gene&species=" (string-capitalize species) - "a gnc:rgdLink")) + "a gnc:rgd_link")) ""))) (set dct:references (let ((geneId (field GeneList GeneID)) @@ -147,7 +146,7 @@ species "#goto=genereport&id=" geneId - "a gnc:biogpsLink")) + "a gnc:biogps_link")) ""))) (set dct:references (let ((geneId (field GeneList GeneID))) @@ -157,7 +156,7 @@ "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a" "http://www.chibi.ubc.ca/Gemma/gene/showGene.html?ncbiid=" geneId - "a gnc:gemmaLink")) + "a gnc:gemma_link")) ""))) (set dct:references (let ((symbol (field GeneList GeneSymbol)) @@ -175,7 +174,7 @@ species (uri-encode (string-trim-both symbol)) - "a gnc:genemaniaLink")) + "a gnc:genemania_link")) ""))) (set dct:references (let ((symbol (field GeneList GeneSymbol))) @@ -186,7 +185,7 @@ "http://www.pantherdb.org/genes/geneList.do?searchType=basic&fieldName=all&organism=all&listType=1&fieldValue=" (uri-encode (string-trim-both symbol)) - "a gnc:pantherLink")) + "a gnc:panther_link")) ""))) (set dct:references (let ((symbol (field GeneList GeneSymbol))) @@ -197,7 +196,7 @@ "http://string-db.org/newstring_cgi/show_network_section.pl?identifier=" (uri-encode (string-trim-both symbol)) - "a gnc:stringLink")) + "a gnc:panther_link")) ""))) (set dct:references (let ((symbol (field GeneList GeneSymbol))) @@ -208,7 +207,7 @@ 
"https://www.gtexportal.org/home/gene/" (uri-encode (string-trim-both symbol)) - "a gnc:gtexLink")) + "a gnc:gtex_link")) ""))) (set dct:references (let ((symbol (field GeneList GeneSymbol))) @@ -219,33 +218,27 @@ "http://www.proteinatlas.org/search/" (uri-encode (string-trim-both symbol)) - "a gnc:proteinAtlasLink")) + "a gnc:protein_atlas_link")) ""))) (set gnt:chromosome (field GeneList Chromosome)) - (set gnt:TxStart (annotate-field + (set gnt:tx_start (annotate-field (field GeneList TxStart) '^^xsd:double)) - (set gnt:TxEnd (annotate-field + (set gnt:tx_end (annotate-field (field GeneList TxEnd) '^^xsd:double)) - (set gnt:Strand (string-trim-both (field GeneList Strand))) + (set gnt:strand (string-trim-both (field GeneList Strand))) (set - gnt:belongsToSpecies - (string->identifier - "" - (remap-species-identifiers - (string-trim-both (field Species Name))) - #:separator "" - #:proc string-capitalize-first)) + gnt:belongs_to_species (string->identifier "" (remap-species-identifiers (field Species Fullname)))) (set gnt:transcript (ontology 'transcript: (string-trim-both (field GeneList NM_ID)))) - (set gnt:hasKgID (string-trim-both (field GeneList kgID))) - (set gnt:hasUnigenID (string-trim-both (field GeneList UnigenID))) - (set gnt:hasProteinID (string-trim-both (field GeneList ProteinID))) - (set gnt:hasAlignID (string-trim-both (field GeneList AlignID))) - (set gnt:hasRgdID + (set gnc:has_kg_id (string-trim-both (field GeneList kgID))) + (set gnc:has_unigen_id (string-trim-both (field GeneList UnigenID))) + (set gnt:has_protein_id (string-trim-both (field GeneList ProteinID))) + (set gnt:has_align_id (string-trim-both (field GeneList AlignID))) + (set gnt:has_rgd_id (field ("IFNULL(RGD_ID, '')" RGD_ID))))) (define-transformer genelist-rn33 @@ -257,25 +250,26 @@ (if (number? 
gene-uid) (number->string gene-uid) - gene-uid))) - (set rdf:type 'gnc:Gene) - (set gnt:belongsToSpecies 'gn:Rattus_norvegicus) - (set gnt:geneSymbol (string-trim-both (field GeneList_rn33 geneSymbol))) + gene-uid) + #:separator "_")) + (set rdf:type 'gnc:gene) + (set gnt:belongs_to_species 'gn:Rattus_norvegicus) + (set gnt:gene_symbol (string-trim-both (field GeneList_rn33 geneSymbol))) (set gnt:chromosome (field GeneList_rn33 chromosome)) - (set gnt:TxStart (annotate-field + (set gnt:tx_start (annotate-field (field GeneList_rn33 txStart) '^^xsd:double)) - (set gnt:TxEnd (annotate-field + (set gnt:tx_end (annotate-field (field GeneList_rn33 txEnd) '^^xsd:double)) - (set gnt:Strand (string-trim-both (field GeneList_rn33 strand))) + (set gnt:strand (string-trim-both (field GeneList_rn33 strand))) (set gnt:transcript (ontology 'transcript: (string-trim-both (field GeneList_rn33 NM_ID)))) (set - gnt:hasKgID + gnc:has_kg_id (string-trim-both (field GeneList_rn33 kgID))) (set dct:references (let ((symbol (field GeneList_rn33 geneSymbol))) @@ -295,7 +289,7 @@ "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a" "https://www.ebi.ac.uk/gwas/search?query=" (string-trim-both symbol) - "a gnc:ebiGwasLink")) + "a gnc:ebi_gwas_link")) ""))) (set dct:references (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol)))) @@ -306,7 +300,7 @@ "http://string-db.org/newstring_cgi/show_network_section.pl?identifier=" (uri-encode (string-trim-both symbol)) - "a gnc:stringLink")) + "a gnc:panther_link")) ""))) (set dct:references (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol)))) @@ -317,7 +311,7 @@ "https://www.gtexportal.org/home/gene/" (uri-encode (string-trim-both symbol)) - "a gnc:gtexLink")) + "a gnc:gtex_link")) ""))) (set dct:references (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol)))) @@ -328,7 +322,7 @@ "http://www.proteinatlas.org/search/" (uri-encode (string-trim-both symbol)) - "a gnc:proteinAtlasLink")) + "a gnc:protein_atlas_link")) ""))))) 
@@ -349,10 +343,10 @@ (connection %connection-settings) (table-metadata? #f) (prefixes - '(("gn:" "<http://genenetwork.org/id/>") - ("probeset:" "<http://genenetwork.org/probeset/>") - ("gnc:" "<http://genenetwork.org/category/>") - ("gnt:" "<http://genenetwork.org/term/>") + '(("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("probeset:" "<http://rdf.genenetwork.org/v1/probeset/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") ("dct:" "<http://purl.org/dc/terms/>") diff --git a/examples/generif.scm b/examples/generif.scm index fb3208a..c4b70ae 100755 --- a/examples/generif.scm +++ b/examples/generif.scm @@ -11,177 +11,124 @@ (transform strings) (transform sql) (transform triples) - (transform special-forms) - (transform uuid)) + (transform special-forms)) -(define (fix-email-id email) - (string-delete #\space email)) - -(define (investigator-attributes->id first-name last-name email) - ;; There is just one record corresponding to "Evan Williams" which - ;; does not have an email ID. To accommodate that record, we - ;; construct the investigator ID from not just the email ID, but - ;; also the first and the last names. It would be preferable to just - ;; find Evan Williams' email ID and insert it into the database. 
- (string->identifier "investigator" - (string-join - (list first-name last-name (fix-email-id email)) - "_"))) - - - -(define-transformer genewiki-symbols - (tables (GeneRIF_BASIC) - "GROUP BY BINARY symbol") - (triples - (string->identifier - "symbol" - (regexp-substitute/global #f "[^A-Za-z0-9:]" - (field GeneRIF_BASIC symbol) - 'pre "_" 'post) - #:proc (lambda (x) x)) - (set rdfs:label - (field GeneRIF_BASIC symbol)))) - (define-transformer gn-genewiki-entries (tables (GeneRIF (left-join Species "ON Species.SpeciesId = GeneRIF.SpeciesId") (left-join GeneRIFXRef "ON GeneRIFXRef.GeneRIFId = GeneRIF.Id") - (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id") - (left-join Investigators "ON Investigators.Email = GeneRIF.email")) - "WHERE GeneRIF.display > 0 AND GeneRIF.VersionId = 0 AND GeneRIF.comment IS NOT NULL GROUP BY GeneRIF.comment, BINARY GeneRIF.symbol") + (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id")) + "WHERE GeneRIF.display > 0 AND GeneRIF.comment IS NOT NULL +GROUP BY GeneRIF.Id, GeneRIF.versionId, GeneRIF.symbol") (schema-triples - (gnc:GeneWikiEntry a rdfs:Class) - (gnc:GNWikiEntry rdfs:subClassOf gnc:GeneWikiEntry) - (gnc:GNWikiEntry rdfs:comment "Represents GeneRIF Entries entered from GeneNetwork") - (gnt:geneSymbol rdfs:domain gnc:GNWikiEntry)) + (gnc:gene_wiki_entry a rdfs:Class) + (gnc:gn_wiki_entry rdfs:subClassOf gnc:gene_wiki_entry) + (gnt:initial a owl:ObjectProperty) + (gnt:initial rdfs:domain gnc:gene_wiki_entry) + (gnt:initial skos:definition "Optional user or project code or your initials") + (gnt:reason a owl:ObjectProperty) + (gnt:reason rdfs:domain gnc:gene_wiki_entry) + (gnt:reason skos:definition "The reason why this resource was modified") + (gnc:gn_wiki_entry rdfs:comment "Represents GeneRIF Entries entered from GeneNetwork") + (gnt:gene_symbol rdfs:domain gnc:gn_wiki_entry)) (triples (string->identifier - "symbol" - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field GeneRIF 
symbol) - 'pre "_" 'post) - #:proc (lambda (x) x)) - (set rdfs:comment - (let* ([generif-comment (sanitize-rdf-string (field GeneRIF comment))] - [create-time (field GeneRIF createtime EntryCreateTime)] - [pmid (field GeneRIF PubMed_ID PMID)] - [web-url (field GeneRIF weburl)] - [species (string->identifier - "" - (remap-species-identifiers (field Species Fullname)) - #:separator "" - #:proc string-capitalize-first)] - [categories - (remove (lambda (x) - (or (eq? x #f) - (and (string? x) - (string-null? x)))) - (remove-duplicates - (string-split-substring - (field ("GROUP_CONCAT(DISTINCT GeneCategory.Name SEPARATOR '$$')" - GeneCategory)) - "$$")))]) - (string->symbol - (string-append - "[ " - (format #f "rdf:type gnc:GNWikiEntry ; ") - (if (string? species) - "" - (format #f "gnt:belongsToSpecies ~a ; " - species)) - (format #f "rdfs:comment ~s^^xsd:string ; " - generif-comment) - (if (string? create-time) - "" - (format #f "dct:created ~s^^xsd:datetime ; " - (time-unix->string - create-time "~5"))) - (if (and (string? pmid) (not (string-null? pmid))) - (format #f - "~{dct:references pubmed:~a ; ~}" - (string-split pmid #\space)) - "") - (if (and (not (string-null? - (string-trim-both (field GeneRIF email)))) - (not (string-null? (field Investigators Email)))) - (format #f "dct:creator ~a ; " - (investigator-attributes->id - (field Investigators FirstName) - (field Investigators LastName) - (field Investigators Email))) - "") - (if (not (null? categories)) - (format #f - "~{gnt:belongsToCategory ~s ; ~}" - categories) - "") - (if (and (string? web-url) (not (string-null? web-url))) - (format #f "foaf:homepage ~s ; " - web-url) - "") - " ] ")))))) + "wiki" (format #f "~a_~a" + (field GeneRIF Id) + (field GeneRIF versionId)) + #:separator "_") + (set rdfs:label (string->symbol + (format #f "'~a'@en" + (replace-substrings + (sanitize-rdf-string + (field GeneRIF comment)) + '(("'" . 
"\\'")))))) + (set rdf:type 'gnc:gn_wiki_entry) + (set gnt:symbol (field GeneRIF symbol)) + (set gnt:belongs_to_species (string->identifier "" (remap-species-identifiers (field Species Fullname)))) + (set dct:created + (string->symbol + (format #f "~s^^xsd:datetime " + (field + ("CAST(createtime AS CHAR)" EntryCreateTime))))) + (multiset dct:references + (map (lambda (pmid) + (match pmid + ((? string-blank? p) "") + (p (string->symbol + (format #f "pubmed:~a" (string-trim-both pmid)))))) + (string-split (field GeneRIF PubMed_ID PMID) + #\space))) + (set foaf:mbox + (match (sanitize-rdf-string (field GeneRIF email)) + ((? string-blank? mbox) "") + (mbox (string->symbol + (format #f "<~a>" mbox))))) + (set dct:identifier (annotate-field (format #f "~s" (field GeneRIF Id)) + '^^xsd:integer)) + (set foaf:homepage + (match (sanitize-rdf-string (field GeneRIF weburl)) + ((? string-blank? homepage) "") + (homepage (string->symbol + (format #f "<~a>" homepage))))) + (set dct:hasVersion (annotate-field (format #f "~s" (field GeneRIF versionId)) + '^^xsd:integer)) + (set gnt:initial (sanitize-rdf-string (field GeneRIF initial))) + (set gnt:reason (field GeneRIF reason)) + (multiset gnt:belongs_to_category + (string-split + (field ("GROUP_CONCAT(DISTINCT GeneCategory.Name SEPARATOR ';')" + GeneCategory)) + #\;)))) (define-transformer ncbi-genewiki-entries (tables (GeneRIF_BASIC - (left-join Species "USING (SpeciesId)")) - "WHERE GeneRIF_BASIC.comment IS NOT NULL AND TRIM(GeneRIF_BASIC.comment) != '' AND TRIM(GeneRIF_BASIC.symbol) != '' GROUP BY GeneRIF_BASIC.comment, GeneRIF_BASIC.createtime, GeneRIF_BASIC.VersionId, GeneRIF_BASIC.SpeciesId, GeneRIF_BASIC.TaxID") + (left-join Species "USING (SpeciesId)"))) (schema-triples - (gnc:NCBIWikiEntry rdfs:subClassOf gnc:GeneWikiEntry) - (gnc:NCBIWikiEntry rdfs:comment "Represents GeneRIF Entries obtained from NCBI") - (gnt:hasVersionId a owl:ObjectProperty) - (gnt:hasVersionId rdfs:domain gnc:NCBIWikiEntry) - (gnt:hasVersionId 
skos:definition "The VersionId of this this resource")) + (gnc:ncbi_wiki_entry rdfs:subClassOf gnc:gene_wiki_entry) + (gnc:ncbi_wiki_entry rdfs:comment "Represents GeneRIF Entries obtained from NCBI")) (triples (string->identifier - "symbol" - (regexp-substitute/global #f "[^A-Za-z0-9:]" - (field GeneRIF_BASIC symbol GeneRIFSymbol) - 'pre "_" 'post) - #:proc (lambda (x) x)) - (set rdfs:comment - (let ([ncbi-comment (sanitize-rdf-string (field GeneRIF_BASIC comment))] - [species-name - (string->identifier - "" - (remap-species-identifiers (field Species Fullname SpeciesFullName)) - #:separator "" - #:proc string-capitalize-first)] - [taxonomic-id (field GeneRIF_BASIC TaxID TaxonomicId)] - [create-time (field GeneRIF_BASIC createtime EntryCreateTime)] - [pmid (field GeneRIF_BASIC PubMed_ID PMID)] - [gene-id (field GeneRIF_BASIC GeneId)] - [version-id (field GeneRIF_BASIC VersionId)]) + "rif" (format #f "~a_~a_~a_~a" + (field GeneRIF_BASIC GeneId) + (field GeneRIF_BASIC PubMed_ID) + (field ("DATE_FORMAT(createtime, '%Y-%m-%dT%T')" CreateTime)) + (field GeneRIF_BASIC VersionId)) + #:separator "_") + (set rdf:type + (let* ((comment (format #f "'~a'@en" + (replace-substrings + (sanitize-rdf-string + (field GeneRIF_BASIC comment)) + '(("\\" . "\\\\") + ("\n" . "\\n") + ("\r" . "\\r") + ("'" . "\\'"))))) + (create-time (format #f "~s^^xsd:datetime" + (field + ("CAST(createtime AS CHAR)" EntryCreateTime)))) + (symbol (field GeneRIF_BASIC symbol)) + (species (string->identifier "" (remap-species-identifiers (field Species Fullname)))) + (gene-id (field GeneRIF_BASIC GeneId)) + (taxon-id (field GeneRIF_BASIC TaxID TaxonomicId)) + (pmid (field GeneRIF_BASIC PubMed_ID)) + (version-id (field GeneRIF_BASIC versionId))) (string->symbol (string-append - "[ " - (format #f "rdf:type gnc:NCBIWikiEntry ; ") - (format #f "rdfs:comment ~s^^xsd:string ; " - ncbi-comment) - (format #f "gnt:belongsToSpecies ~a ; " - species-name) - (if (eq? 
#f taxonomic-id) - "" - (format #f "skos:notation taxon:~a ; " - taxonomic-id)) - (format #f "gnt:hasGeneId generif:~a ; " - gene-id) - (format #f "gnt:hasVersionId '~a'^^xsd:integer ; " - version-id) - (if (and (string? pmid) (not (string-null? pmid))) - (format #f - "~{dct:references pubmed:~a ; ~}" - (string-split pmid #\space)) - "") - (if (string? create-time) - "" - (format #f "dct:created ~s^^xsd:datetime ; " - (time-unix->string - create-time "~5"))) - " ]")))))) + (format #f "gnc:ncbi_wiki_entry ;\n") + (format #f "\trdfs:label ~a ;\n" comment) + (format #f "\tgnt:belongs_to_species ~a ;\n" species) + (format #f "\tgnt:symbol ~s ;\n" symbol) + (format #f "\tgnt:has_gene_id generif:~a ;\n" gene-id) + (match taxon-id + ((? number? x) + (format #f "\tskos:notation taxon:~a ;\n" taxon-id)) + (else "")) + (format #f "\tdct:hasVersion \"~a\"^^xsd:integer ;\n" version-id) + (format #f "\tdct:references pubmed:~a ;\n" pmid) + (format #f "\tdct:created ~a" create-time))))))) @@ -206,9 +153,9 @@ ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") ("skos:" "<http://www.w3.org/2004/02/skos/core#>") ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>") - ("gn:" "<http://genenetwork.org/id/>") - ("gnc:" "<http://genenetwork.org/category/>") - ("gnt:" "<http://genenetwork.org/term/>") + ("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") ("dct:" "<http://purl.org/dc/terms/>") ("foaf:" "<http://xmlns.com/foaf/0.1/>") ("pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>") @@ -218,9 +165,9 @@ ("owl:" "<http://www.w3.org/2002/07/owl#>"))) (inputs (list - genewiki-symbols - gn-genewiki-entries - ncbi-genewiki-entries)) + ;; gn-genewiki-entries + ncbi-genewiki-entries + )) (outputs `(#:documentation ,documentation #:rdf ,output)))) diff --git a/examples/genotype.scm b/examples/genotype.scm index 7e72cf8..ac170be 100755 --- a/examples/genotype.scm +++ b/examples/genotype.scm @@ 
-21,61 +21,53 @@ (schema-triples (gnt:chr a owl:ObjectProperty) (gnt:chr skos:description "This resource is located on a given chromosome") - (gnt:chr rdfs:domain gnc:Genotype) + (gnt:chr rdfs:domain gnc:genotype) (gnt:mb a owl:ObjectProperty) (gnt:mb skos:definition "The size of this resource in Mb") - (gnt:mb rdfs:domain gnc:Genotype) - (gnt:mbMm8 a owl:ObjectProperty) - (gnt:mbMm8 skos:definition "TODO") - (gnt:mbMm8 rdfs:domain gnc:Genotype) + (gnt:mb rdfs:domain gnc:genotype) + (gnt:mb_mm8 a owl:ObjectProperty) + (gnt:mb_mm8 skos:definition "TODO") + (gnt:mb_mm8 rdfs:domain gnc:genotype) (gnt:mb2016 a owl:ObjectProperty) (gnt:mb2016 skos:definition "TODO") - (gnt:mb2016 rdfs:domain gnc:Genotype) - (gnt:hasSequence a owl:ObjectProperty) - (gnt:hasSequence skos:definition "This resource has a given sequence") - (gnt:hasSequence rdfs:domain gnc:Genotype) - (gnt:hasSource a owl:ObjectProperty) - (gnt:hasSource rdfs:domain gnc:Genotype) - (gnt:hasSource skos:definition "This resource was obtained from this given source") - (gnt:hasAltSourceName a owl:ObjectProperty) - (gnt:hasAltSourceName rdfs:domain gnc:Genotype) - (gnt:hasAltSourceName + (gnt:mb2016 rdfs:domain gnc:genotype) + (gnt:has_sequence a owl:ObjectProperty) + (gnt:has_sequence skos:definition "This resource has a given sequence") + (gnt:has_sequence rdfs:domain gnc:genotype) + (gnt:has_source a owl:ObjectProperty) + (gnt:has_source rdfs:domain gnc:genotype) + (gnt:has_source skos:definition "This resource was obtained from this given source") + (gnt:has_alt_source_name a owl:ObjectProperty) + (gnt:has_alt_source_name rdfs:domain gnc:genotype) + (gnt:has_alt_source_name skos:definition "The alternative name this resource was obtained from") - (gnt:chrNum a owl:ObjectProperty) - (gnt:chrNum rdfs:domain gnc:Genotype) - (gnt:chrNum skos:definition "The chromosome number for this resource")) + (gnt:chr_num a owl:ObjectProperty) + (gnt:chr_num rdfs:domain gnc:genotype) + (gnt:chr_num skos:definition "The 
chromosome number for this resource")) (triples - (string->identifier - "" - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field Geno Name) - 'pre "_" 'post) - #:separator "" - #:proc string-capitalize-first) - (set rdf:type 'gnc:Genotype) + (string->identifier "" (field Geno Name)) + (set rdf:type 'gnc:genotype) (set rdfs:label (sanitize-rdf-string (field Geno Name))) (set gnt:chr (field Geno Chr)) (set gnt:mb (annotate-field (field ("IFNULL(Geno.Mb, '')" Mb)) '^^xsd:double)) - (set gnt:mbMm8 (annotate-field (field ("IFNULL(Geno.Mb_mm8, '')" Mb_mm8)) + (set gnt:mb_mm8 (annotate-field (field ("IFNULL(Geno.Mb_mm8, '')" Mb_mm8)) '^^xsd:double)) (set gnt:mb2016 (annotate-field (field ("IFNULL(Geno.Mb_2016, '')" Mb_2016)) '^^xsd:double)) - (set gnt:hasSequence (field Geno Sequence)) - (set gnt:hasSource (field Geno Source)) + (set gnt:has_sequence (field Geno Sequence)) + (set gnt:has_source (field Geno Source)) ;; Only transform Source2 if it differs from Source - (set gnt:hasAltSourceName + (set gnt:has_alt_source_name (field ("IF((Source2 = Source), NULL, Source2)" Source2))) - (set gnt:belongsToSpecies - (string->identifier - "" (remap-species-identifiers (field Species Fullname)) - #:separator "" - #:proc string-capitalize-first)) - (set gnt:chrNum + (set gnt:belongs_to_species + (string->identifier "" (remap-species-identifiers (field Species Fullname)) + #:separator "_" + #:proc string-downcase)) + (set gnt:chr_num (annotate-field (field Geno chr_num) '^^xsd:int)) @@ -100,9 +92,9 @@ (table-metadata? 
#f) (prefixes '(("dct:" "<http://purl.org/dc/terms/>") - ("gn:" "<http://genenetwork.org/id/>") - ("gnc:" "<http://genenetwork.org/category/>") - ("gnt:" "<http://genenetwork.org/term/>") + ("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") ("owl:" "<http://www.w3.org/2002/07/owl#>") diff --git a/examples/phenotype.scm b/examples/phenotype.scm index aa1e9c5..ae24d66 100755 --- a/examples/phenotype.scm +++ b/examples/phenotype.scm @@ -20,50 +20,50 @@ (left-join Publication "ON Publication.Id = PublishXRef.PublicationId") (left-join Phenotype "ON Phenotype.Id = PublishXRef.PhenotypeId"))) (schema-triples - (gnt:traitId a owl:ObjectProperty) - (gnt:traitId rdfs:domain gnc:Phenotype) - (gnt:traitId skos:definition "This is the unique trait id assigned from GeneNetwork") + (gnt:trait_id a owl:ObjectProperty) + (gnt:trait_id rdfs:domain gnc:phenotype) + (gnt:trait_id skos:definition "This is the unique trait id assigned from GeneNetwork") (gnt:abbreviation a owl:ObjectProperty) - (gnt:abbreviation rdfs:domain gnc:Phenotype) + (gnt:abbreviation rdfs:domain gnc:phenotype) (gnt:abbreviation skos:definition "The abbreviation used for this resource") (gnt:labCode a owl:ObjectProperty) - (gnt:labCode rdfs:domain gnc:Phenotype) + (gnt:labCode rdfs:domain gnc:phenotype) (gnt:submitter a owl:ObjectProperty) - (gnt:submitter rdfs:domain gnc:Phenotype) + (gnt:submitter rdfs:domain gnc:phenotype) (gnt:submitter skos:definition "A person who submitted this resource to GN") (gnt:mean a rdf:Property) (gnt:mean a qb:MeasureProperty) (gnt:mean rdfs:subPropertyOf sdmx-measure:obsValue) - (gnt:mean rdfs:domain gnc:Phenotype) + (gnt:mean rdfs:domain gnc:phenotype) (gnt:mean rdfs:range xsd:double) - (gnt:lodScore a rdf:Property) - (gnt:lodScore a qb:MeasureProperty) - (gnt:lodScore 
rdfs:subPropertyOf sdmx-measure:obsValue) - (gnt:lodScore rdfs:domain gnc:Phenotype) - (gnt:lodScore rdfs:range xsd:double) - (gnt:lodScore rdfs:label "Peak -logP") - (gnt:lodScore skos:definition "Statistical measurement assessing the likelihood of genetic linkage between traits or genetic markers.") + (gnt:lod_score a rdf:Property) + (gnt:lod_score a qb:MeasureProperty) + (gnt:lod_score rdfs:subPropertyOf sdmx-measure:obsValue) + (gnt:lod_score rdfs:domain gnc:phenotype) + (gnt:lod_score rdfs:range xsd:double) + (gnt:lod_score rdfs:label "Peak -logP") + (gnt:lod_score skos:definition "Statistical measurement assessing the likelihood of genetic linkage between traits or genetic markers.") (gnt:locus a rdf:Property) (gnt:locus a qb:MeasureProperty) (gnt:locus rdfs:subPropertyOf sdmx-measure:obsValue) - (gnt:locus rdfs:domain gnc:Phenotype) + (gnt:locus rdfs:domain gnc:phenotype) (gnt:locus rdfs:range rdfs:Literal) - (gnt:additive rdfs:domain gnc:Phenotype) + (gnt:additive rdfs:domain gnc:phenotype) (gnt:additive rdfs:range xsd:double) - (gnt:sequence rdfs:domain gnc:Phenotype) + (gnt:sequence rdfs:domain gnc:phenotype) (gnt:sequence rdfs:range xsd:integer)) (triples (string->identifier "trait" (field ("CONCAT(IFNULL(InbredSet.InbredSetCode, PublishXRef.InbredSetId), '_', PublishXRef.Id)" - Phenotype))) - (set rdf:type 'gnc:Phenotype) - (set gnt:belongsToGroup + Phenotype)) + #:separator "_") + (set rdf:type 'gnc:phenotype) + (set gnt:belongs_to_group (string->identifier "set" (field InbredSet Name InbredSetName) - #:separator "" - #:proc string-capitalize-first)) + #:separator "_")) ;; This is the trait's name - (set gnt:traitId + (set gnt:trait_id (let ((trait-id (field PublishXRef Id))) (if (number? 
trait-id) (number->string trait-id) @@ -83,16 +83,8 @@ (set dct:contributor (sanitize-rdf-string (field Phenotype Owner))) (set gnt:mean (annotate-field (field ("IFNULL(PublishXRef.mean, '')" mean)) '^^xsd:double)) - (set gnt:locus - (string->identifier - "" - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (sanitize-rdf-string (field PublishXRef Locus)) - 'pre "_" 'post) - #:separator "" - #:proc string-capitalize-first)) - (set gnt:lodScore (annotate-field + (set gnt:locus (sanitize-rdf-string (field PublishXRef Locus))) + (set gnt:lod_score (annotate-field (field ("IFNULL((PublishXRef.LRS/4.604), '')" lrs)) '^^xsd:double)) (set gnt:additive @@ -128,10 +120,10 @@ (table-metadata? #f) (prefixes '(("dct:" "<http://purl.org/dc/terms/>") - ("gn:" "<http://genenetwork.org/id/>") + ("gn:" "<http://rdf.genenetwork.org/v1/id/>") ("owl:" "<http://www.w3.org/2002/07/owl#>") - ("gnc:" "<http://genenetwork.org/category/>") - ("gnt:" "<http://genenetwork.org/term/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") ("sdmx-measure:" "<http://purl.org/linked-data/sdmx/2009/measure#>") ("skos:" "<http://www.w3.org/2004/02/skos/core#>") ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") diff --git a/examples/probeset.scm b/examples/probeset.scm deleted file mode 100755 index 9f694af..0000000 --- a/examples/probeset.scm +++ /dev/null @@ -1,203 +0,0 @@ -#! 
/usr/bin/env guile -!# - -(use-modules (srfi srfi-1) - (srfi srfi-26) - (ice-9 format) - (ice-9 getopt-long) - (ice-9 match) - (ice-9 regex) - (transform strings) - (transform sql) - (transform triples) - (transform special-forms) - (web uri)) - - -(define-transformer probeset - (tables (ProbeSet - (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId") - (left-join Species "ON GeneChip.SpeciesId = Species.Id")) - "WHERE ProbeSet.Name IS NOT NULL") - (schema-triples - (gnc:omimLink rdfs:Class gnc:ResourceLink) - (gnc:omimLink rdfs:label "OMIM") - (gnc:omimLink rdfs:comments "Summary from On Mendelion Inheritance in Man") - (gnc:homologeneLink rdfs:Class gnc:ResourceLink) - (gnc:homologeneLink rdfs:label "HomoloGene") - (gnc:homologeneLink rdfs:comments "Find similar genes in other species") - (gnc:uniprot a owl:ObjectProperty) - (gnc:uniprot rdfs:label "UniProt") - (gnc:uniprot rdfs:comments "UniProt resource") - (gnt:hasChip a owl:ObjectProperty) - (gnt:hasChip rdfs:domain gnc:Probeset) - (gnt:hasTargetId a owl:ObjectProperty) - (gnt:hasTargetId rdfs:domain gnc:Probeset) - (gnt:geneSymbol rdfs:domain gnc:Probeset) - (gnt:location rdfs:domain gnc:ProbeSet) - (gnt:location a owl:ObjectProperty) - (gnt:strandPosition rdfs:domain gnc:ProbeSet) - (gnt:strandPosition a owl:ObjectProperty) - (gnt:targetsRegion a owl:ObjectProperty) - (gnt:targetsRegion rdfs:domain gnc:Probeset) - (gnt:chr rdfs:domain gnc:Probeset) - (gnt:mb rdfs:domain gnc:Probeset) - (gnt:hasSpecificity a owl:ObjectProperty) - (gnt:hasSpecificity rdfs:domain gnc:Probeset) - (gnt:hasBlatScore a owl:ObjectProperty) - (gnt:hasBlatScore rdfs:domain gnc:Probeset) - (gnt:hasBlatMbStart a owl:ObjectProperty) - (gnt:hasBlatMbStart rdfs:domain gnc:Probeset) - (gnt:hasBlatMbEnd a owl:ObjectProperty) - (gnt:hasBlatMbEnd rdfs:domain gnc:Probeset) - (gnt:hasBlatSeq a owl:ObjectProperty) - (gnt:hasBlatSeq rdfs:domain gnc:Probeset) - (gnt:hasTargetSeq a owl:ObjectProperty) - (gnt:hasTargetSeq rdfs:domain 
gnc:Probeset)) - (triples - (let ((id (field ("IF(NULLIF(TRIM(ProbeSet.Name), '') IS NULL, '', TRIM(ProbeSet.Name))" - ProbeSetIdName))) - (probeset-id (field ProbeSet Id))) - (string->identifier - "probeset" - (if (string-null? id) - (number->string probeset-id) - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - id - 'pre "_" 'post)))) - (set rdf:type 'gnc:Probeset) - (set rdfs:label (field ProbeSet Name)) - (set skos:altLabel - (replace-substrings - (field ProbeSet alias) - '(("\r\n" . "; ")))) - (set gnt:hasChip - (string->identifier - "platform" - (field ("IFNULL(GeneChip.Name, '')" GeneChipName)))) - (set gnt:hasTargetId - (field ("NULLIF(TRIM(ProbeSet.TargetId), '')" - TargetId))) - (set gnt:geneSymbol - (field ProbeSet Symbol)) - (set dct:description (sanitize-rdf-string (field ProbeSet description))) - (set gnt:targetsRegion - (sanitize-rdf-string - (field ("NULLIF(TRIM(ProbeSet.Probe_set_target_region), '')" - Probe_set_target_region)))) - (set gnt:chr (field ProbeSet Chr)) - (set gnt:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double)) - (set gnt:location - (let* ((mb (field ProbeSet Mb)) - (chr (field ProbeSet Chr)) - (strand-probe (field ProbeSet Strand_Probe)) - (location (list chr mb))) - (match location - (("Un" mb) - (format #f "Not available")) - ((chr "") - (if (string-blank? chr) - (format #f "Not available") - (format #f "Chr ~a @ Unknown position ~a~:[~;~a~]" - chr mb - (and (string? strand-probe) (or (string=? "+" strand-probe) - (string=? "-" strand-probe))) - (cond ((string=? "+" strand-probe) - "on the plus strand") - ((string=? "-" strand-probe) - "on the minus strand") - (else ""))))) - (_ - (format #f "Chr ~a @ ~a Mb ~:[~;~a~]" - chr mb - (and (string? strand-probe) (or (string=? "+" strand-probe) - (string=? "-" strand-probe))) - (cond ((string=? "+" strand-probe) - "on the plus strand") - ((string=? 
"-" strand-probe) - "on the minus strand") - (else ""))))))) - (set gnt:hasGeneId - (ontology 'gene: - (string-trim-both (field ProbeSet GeneId)))) - ;; OMIM Link - (set dct:references - (let ((omim (field ProbeSet OMIM))) - (if (not (string-blank? omim)) - (string->symbol - (format #f - "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a" - "http://www.ncbi.nlm.nih.gov/omim/" - (uri-encode omim) - "a gnc:omimLink")) - ""))) - ;; Homologene Link - (set dct:references - (let ((homologene (field ProbeSet HomoloGeneID))) - (if (not (string-blank? homologene)) - (string->symbol - (format #f - "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a" - "http://www.ncbi.nlm.nih.gov/homologene/?term=" - (uri-encode homologene) - "a gnc:homologeneLink")) - ""))) - (set gnt:uniprot - (ontology 'uniprot: (field ProbeSet UniProtID))) - (set gnt:strandProbe - (field ProbeSet Strand_Probe)) - (set gnt:hasSpecificity - (field ("IFNULL(ProbeSet.Probe_set_specificity, '')" - Probe_set_specificity))) - (set gnt:hasBlatScore - (field ("IFNULL(ProbeSet.Probe_set_BLAT_score, '')" - Probe_set_BLAT_score))) - (set gnt:hasBlatMbStart - (annotate-field (field ("IFNULL(ProbeSet.Probe_set_Blat_Mb_start, '')" - Probe_set_Blat_Mb_start)) - '^^xsd:double)) - (set gnt:hasBlatMbEnd - (annotate-field (field ("IFNULL(ProbeSet.Probe_set_Blat_Mb_end, '')" - Probe_set_Blat_Mb_end)) - '^^xsd:double)) - (set gnt:hasBlatSeq (sanitize-rdf-string (field ProbeSet BlatSeq))) - (set gnt:hasTargetSeq (sanitize-rdf-string (field ProbeSet TargetSeq))))) - - - - -(let* ((option-spec - '((settings (single-char #\s) (value #t)) - (output (single-char #\o) (value #t)) - (documentation (single-char #\d) (value #t)))) - (options (getopt-long (command-line) option-spec)) - (settings (option-ref options 'settings #f)) - (output (option-ref options 'output #f)) - (documentation (option-ref options 'documentation #f)) - (%connection-settings - (call-with-input-file settings - read))) - (with-documentation - (name "ProbeSet Metadata") - (connection 
%connection-settings) - (table-metadata? #f) - (prefixes - '(("gn:" "<http://genenetwork.org/id/>") - ("probeset:" "<http://genenetwork.org/probeset/>") - ("gnc:" "<http://genenetwork.org/category/>") - ("gene:" "<http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=>") - ("gnt:" "<http://genenetwork.org/term/>") - ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") - ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") - ("dct:" "<http://purl.org/dc/terms/>") - ("owl:" "<http://www.w3.org/2002/07/owl#>") - ("xsd:" "<http://www.w3.org/2001/XMLSchema#>") - ("qb:" "<http://purl.org/linked-data/cube#>") - ("sdmx-measure:" "<http://purl.org/linked-data/sdmx/2009/measure#>") - ("skos:" "<http://www.w3.org/2004/02/skos/core#>"))) - (inputs - (list probeset)) - (outputs - `(#:documentation ,documentation - #:rdf ,output)))) diff --git a/examples/publication.scm b/examples/publication.scm index eab4da7..6b57856 100755 --- a/examples/publication.scm +++ b/examples/publication.scm @@ -70,12 +70,12 @@ (connection %connection-settings) (table-metadata? #f) (prefixes - '(("gnt:" "<http://genenetwork.org/term/>") + '(("gnt:" "<http://rdf.genenetwork.org/v1/term/>") ("fabio:" "<http://purl.org/spar/fabio/>") ("dct:" "<http://purl.org/dc/terms/>") ("prism:" "<http://prismstandard.org/namespaces/basic/2.0/>") - ("gn:" "<http://genenetwork.org/id/>") - ("gnc:" "<http://genenetwork.org/category/>") + ("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") ("pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>") ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") ("xsd:" "<http://www.w3.org/2001/XMLSchema#>") diff --git a/examples/schema.scm b/examples/schema.scm new file mode 100755 index 0000000..4bde895 --- /dev/null +++ b/examples/schema.scm @@ -0,0 +1,70 @@ +#! 
/usr/bin/env guile +!# + +(use-modules (ice-9 getopt-long) + (transform triples) + (transform schema) + (transform special-forms) + (transform sql) + (transform table)) + +(define (call-with-genenetwork-database connection-settings proc) + (call-with-database "mysql" (string-join + (list (assq-ref connection-settings 'sql-username) + (assq-ref connection-settings 'sql-password) + (assq-ref connection-settings 'sql-database) + "tcp" + (assq-ref connection-settings 'sql-host) + (number->string + (assq-ref connection-settings 'sql-port))) + ":") + proc)) + +(define (transform-table-schema connection-settings db) + (let ((tables (tables connection-settings db))) + (for-each (lambda (table) + (let ((table-id (string->identifier + "table" + ;; We downcase table names in + ;; identifiers. So, we distinguish + ;; between the user and User tables. + (if (string=? (table-name table) "User") + "user2" + (table-name table))))) + (triple table-id 'rdf:type 'gn:sqlTable) + (triple table-id 'gn:name (table-name table)) + (triple table-id 'gn:has_size (string->symbol (format #f "~a" (table-size table)))) + (for-each (lambda (column) + (let ((column-id (column-id (table-name table) + (column-name column)))) + (triple column-id 'rdf:type 'gn:sql_table_field) + (triple column-id 'gn:name (column-name column)) + (triple column-id 'gn:sql_field_type (column-type column)) + (triple table-id 'gn:has_field column-id))) + (table-columns table)))) + tables))) + + +(let* ((option-spec + '((settings (single-char #\s) (value #t)) + (output (single-char #\o) (value #t)) + (documentation (single-char #\d) (value #t)))) + (options (getopt-long (command-line) option-spec)) + (settings (option-ref options 'settings #f)) + (output (option-ref options 'output #f)) + (documentation (option-ref options 'documentation #f)) + (%connection-settings (call-with-input-file settings read))) + (call-with-genenetwork-database + %connection-settings + (lambda (db) + (with-output-to-file output + (lambda () + 
(prefix "rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") + (prefix "rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") + (prefix "gn:" "<http://rdf.genenetwork.org/v1/id/>") + (prefix "gnc:" "<http://rdf.genenetwork.org/v1/category/>") + (prefix "gnt:" "<http://rdf.genenetwork.org/v1/term/>") + (prefix "xsd:" "<http://www.w3.org/2001/XMLSchema#>") + (prefix "owl:" "<http://www.w3.org/2002/07/owl#>") + (newline) + (transform-table-schema %connection-settings db)))))) diff --git a/examples/strains.scm b/examples/strains.scm index 2e1e24f..cb4978f 100755 --- a/examples/strains.scm +++ b/examples/strains.scm @@ -11,15 +11,6 @@ (transform triples) (transform special-forms)) -(define (remap-species-identifiers str) - "This procedure remaps identifiers to standard binominal. Obviously this should - be sorted by correcting the database!" - (match str - ["Fly (Drosophila melanogaster dm6)" "Drosophila melanogaster"] - ["Oryzias latipes (Japanese medaka)" "Oryzias latipes"] - ["Macaca mulatta" "Macaca nemestrina"] - ["Bat (Glossophaga soricina)" "Glossophaga soricina"] - [str str])) #! @@ -69,33 +60,28 @@ At this point it is not very clear how Name, Name2, Symbol and Alias are used. 
(schema-triples (gnt:alias rdfs:domain gnc:strain) (gnt:alias a owl:ObjectProperty) - (gnt:geneSymbol rdfs:domain gnc:strain) - (gnt:geneSymbol a owl:ObjectProperty)) + (gnt:gene_symbol rdfs:domain gnc:strain) + (gnt:gene_symbol a owl:ObjectProperty)) (triples (string->identifier - "" - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field Strain Name) - 'pre "_" 'post)) + "strain" + (field Strain Name) + #:separator "_") (set rdf:type 'gnc:strain) - (set gnt:belongsToSpecies - (string->identifier "" (remap-species-identifiers (field Species Fullname)) - #:separator "" - #:proc string-capitalize-first)) + (set gnt:belongs_to_species (string->identifier "" (remap-species-identifiers (field Species Fullname)))) ;; Name, and maybe a second name (set rdfs:label (sanitize-rdf-string (field Strain Name))) (set skos:altLabel (sanitize-rdf-string (field ("IF ((Strain.Name2 != Strain.Name), Strain.Name2, '')" Name2)))) (set gnt:alias (sanitize-rdf-string (field ("IF ((Strain.Alias != Strain.Name), Strain.Alias, '')" Alias)))) - (set gnt:geneSymbol (field Strain Symbol)))) + (set gnt:gene_symbol (field Strain Symbol)))) (define-transformer mapping-method (tables (MappingMethod)) (schema-triples - (gnc:mappingMethod a skos:Concept) - (gnc:mappingMethod skos:definition "Terms that decribe mapping methods used on this resource")) + (gnc:mapping_method a skos:Concept) + (gnc:mapping_method skos:definition "Terms that decribe mapping methods used on this resource")) (triples - (string->identifier "mappingMethod" (field MappingMethod Name)) - (set rdf:type 'gnc:mappingMethod) + (string->identifier "mapping_method" (field MappingMethod Name) #:separator "_") + (set rdf:type 'gnc:mapping_method) (set rdfs:label (field MappingMethod Name)))) (define-transformer avg-method @@ -103,10 +89,10 @@ At this point it is not very clear how Name, Name2, Symbol and Alias are used. ;; the Name field. 
(tables (AvgMethod)) (schema-triples - (gnc:avgMethod a skos:Concept) - (gnc:avgMethod skos:definition "Terms that decribe normalization methods used on this resource")) - (triples (string->identifier "avgMethod" (field AvgMethod Name AvgMethodName)) - (set rdf:type 'gnc:avgMethod) + (gnc:avg_method a skos:Concept) + (gnc:avg_method skos:definition "Terms that decribe normalization methods used on this resource")) + (triples (string->identifier "avg_method" (field AvgMethod Name AvgMethodName) #:separator "_") + (set rdf:type 'gnc:avg_method) (set rdfs:label (field AvgMethod Normalization)))) @@ -124,14 +110,14 @@ At this point it is not very clear how Name, Name2, Symbol and Alias are used. read))) (with-documentation - (name "Species Metadata") + (name "Strain Metadata") (connection %connection-settings) (table-metadata? #f) (prefixes - '(("gn:" "<http://genenetwork.org/id/>") - ("gnc:" "<http://genenetwork.org/category/>") + '(("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") ("owl:" "<http://www.w3.org/2002/07/owl#>") - ("gnt:" "<http://genenetwork.org/term/>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") ("skos:" "<http://www.w3.org/2004/02/skos/core#>") ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>") ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") diff --git a/examples/tissue.scm b/examples/tissue.scm index 2659b66..82cc226 100755 --- a/examples/tissue.scm +++ b/examples/tissue.scm @@ -20,7 +20,8 @@ (gnc:tissue a skos:Concept)) ;; Hopefully the Short_Name field is distinct and can be used as an ;; identifier. - (triples (string->identifier "tissue" (field Tissue Short_Name)) + (triples (string->identifier "tissue" (field Tissue Short_Name) + #:separator "_") (set rdf:type 'gnc:tissue) (set rdfs:label (field Tissue Name)))) @@ -42,10 +43,10 @@ (connection %connection-settings) (table-metadata? 
#f) (prefixes - '(("gn:" "<http://genenetwork.org/id/>") - ("gnt:" "<http://genenetwork.org/term/>") + '(("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") ("skos:" "<http://www.w3.org/2004/02/skos/core#>") - ("gnc:" "<http://genenetwork.org/category/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>"))) (inputs diff --git a/generate-ttl-files.scm b/generate-ttl-files.scm new file mode 100755 index 0000000..28be496 --- /dev/null +++ b/generate-ttl-files.scm @@ -0,0 +1,127 @@ +#! /usr/bin/env guile +!# + +(use-modules (ice-9 format) + (ice-9 getopt-long) + (ice-9 ftw) + (ice-9 regex) + (srfi srfi-26) + (srfi srfi-34) + (srfi srfi-35)) + + +;; Copied over from GNU/Guix source tree. +(define (file-name-predicate regexp) + "Return a predicate that returns true when passed a file name whose base +name matches REGEXP." + (let ((file-rx (if (regexp? regexp) + regexp + (make-regexp regexp)))) + (lambda (file stat) + (regexp-exec file-rx (basename file))))) + +(define* (find-files dir #:optional (pred (const #t)) + #:key (stat lstat) + directories? + fail-on-error?) + "Return the lexicographically sorted list of files under DIR for which PRED +returns true. PRED is passed two arguments: the absolute file name, and its +stat buffer; the default predicate always returns true. PRED can also be a +regular expression, in which case it is equivalent to (file-name-predicate +PRED). STAT is used to obtain file information; using 'lstat' means that +symlinks are not followed. If DIRECTORIES? is true, then directories will +also be included. If FAIL-ON-ERROR? is true, raise an exception upon error." + (let ((pred (if (procedure? pred) + pred + (file-name-predicate pred)))) + ;; Sort the result to get deterministic results. 
+ (sort (file-system-fold (const #t) + (lambda (file stat result) ; leaf + (if (pred file stat) + (cons file result) + result)) + (lambda (dir stat result) ; down + (if (and directories? + (pred dir stat)) + (cons dir result) + result)) + (lambda (dir stat result) ; up + result) + (lambda (file stat result) ; skip + result) + (lambda (file stat errno result) + (format (current-error-port) "find-files: ~a: ~a~%" + file (strerror errno)) + (when fail-on-error? + (error "find-files failed")) + result) + '() + dir + stat) + string<?))) + +(define-syntax-rule (warn-on-error expr file) + (catch 'system-error + (lambda () + expr) + (lambda args + (format (current-error-port) + "warning: failed to delete ~a: ~a~%" + file (strerror + (system-error-errno args)))))) + +(define* (delete-file-recursively dir + #:key follow-mounts?) + "Delete DIR recursively, like `rm -rf', without following symlinks. Don't +follow mount points either, unless FOLLOW-MOUNTS? is true. Report but ignore +errors." + (let ((dev (stat:dev (lstat dir)))) + (file-system-fold (lambda (dir stat result) ; enter? + (or follow-mounts? + (= dev (stat:dev stat)))) + (lambda (file stat result) ; leaf + (warn-on-error (delete-file file) file)) + (const #t) ; down + (lambda (dir stat result) ; up + (warn-on-error (rmdir dir) dir)) + (const #t) ; skip + (lambda (file stat errno result) + (format (current-error-port) + "warning: failed to delete ~a: ~a~%" + file (strerror errno))) + #t + dir + + ;; Don't follow symlinks. + lstat))) + +(let* ((option-spec + '((settings (single-char #\s) (value #t)) + (documentation (single-char #\d) (value #t)) + (output (single-char #\o) (value #t)))) + (options (getopt-long (command-line) option-spec)) + (settings (option-ref options 'settings #f)) + (output (option-ref options 'output #f)) + (documentation (option-ref options 'documentation #f)) + (%source-dir (dirname (current-filename)))) + (unless (file-exists? 
output) + (mkdir output)) + ;; Transform data to RDF + (for-each (lambda (file) + (let* ((base-file-name (basename file ".scm")) + (ttl-file (string-append output "/" base-file-name ".ttl"))) + ;; Ignore dataset-metadata-git.scm because TODO + (unless (string=? base-file-name "dataset-metadata-git") + (system* "guile" "-L" (dirname (current-filename)) file + "--settings" settings "--output" ttl-file)))) + (find-files "./examples" ".scm")) + ;; Copy hand-woven ttl files. + (for-each (lambda (file) + (copy-file + file (format #f "~a/~a" output (basename file)))) + (find-files "./schema" ".ttl")) + ;; Validate transformed turtle files + (for-each (lambda (file) + (system* "rapper" "--input" "turtle" "--count" file)) + (append (find-files output ".ttl") + (find-files "./schema" ".ttl")))) diff --git a/json-dump.scm b/json-to-ttl.scm index 0a054c5..8fc4002 100755 --- a/json-dump.scm +++ b/json-to-ttl.scm @@ -59,7 +59,7 @@ inside it." (with-output-to-file (string-append %directory "/sampledata.ttl") (lambda () - (prefix "gn:" "<http://genenetwork.org/>") + (prefix "gn:" "<http://rdf.genenetwork.org/v1/>") (newline) (run-proc-on-files %data-directory diff --git a/load-rdf.scm b/load-rdf.scm index aaf1b00..db1e1a0 100755 --- a/load-rdf.scm +++ b/load-rdf.scm @@ -16,7 +16,7 @@ (web uri)) (define %graph-uri - "http://genenetwork.org") + "http://rdf.genenetwork.org/v1") (define (call-with-pipe proc mode program . args) "Execute PROGRAM ARGS ... 
in a subprocess with a pipe of MODE to diff --git a/manifest.scm b/manifest.scm index 63e9bd7..d736e51 100644 --- a/manifest.scm +++ b/manifest.scm @@ -6,8 +6,7 @@ (use-modules (gnu packages autotools) ((gnu packages base) #:select (gnu-make)) - ((gnu packages bioinformatics) #:select (ccwl)) - ((gnu packages databases) #:select (virtuoso-ose mariadb)) + (gnu packages databases) (gnu packages graphviz) (gnu packages guile) ((gnu packages guile-xyz) #:select (guile-sparql) #:prefix guix:) @@ -60,5 +59,6 @@ guile-json-4 guile-dsv ;; We abuse (ccwl graphviz) as a library to visualize the database ;; schema. Hence we need ccwl and guile-libyaml. - ccwl graphviz guile-hashing guile-libyaml guile-sparql + ;; ccwl graphviz + guile-hashing guile-libyaml guile-sparql raptor2 run64 virtuoso-ose mariadb)) diff --git a/schema/species.ttl b/schema/species.ttl index f0d5207..60ebd17 100644 --- a/schema/species.ttl +++ b/schema/species.ttl @@ -3,8 +3,8 @@ @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> . @prefix wd: <http://www.wikidata.org/entity/> . -@prefix gn: <http://genenetwork.org/id/> . -@prefix gnt: <http://genenetwork.org/term/> . +@prefix gn: <http://rdf.genenetwork.org/v1/id/> . +@prefix gnt: <http://rdf.genenetwork.org/v1/term/> . # sorted on short names: # diff --git a/transform/schema-dump.scm b/transform/schema.scm index 18df5da..f3896a7 100644 --- a/transform/schema-dump.scm +++ b/transform/schema.scm @@ -4,7 +4,13 @@ #:use-module (transform sql) #:use-module (transform triples) #:use-module (transform strings) - #:use-module (transform table)) + #:use-module (transform table) + #:export (table-fields + get-tables-from-comments + schema-annotations + tables + schema + data-table)) (define (table-fields db table) @@ -47,7 +53,7 @@ (for-each (cut table-fields db <>) (get-tables-from-comments db))))) -(define (tables db) +(define (tables connection-settings db) "Return list of all tables in DB. Each element of the returned list is a <table> object." 
(map (lambda (table) @@ -68,7 +74,7 @@ is a <table> object." (information_schema.tables data_length)) (information_schema.tables) (format #f "WHERE table_schema = '~a'" - (assq-ref %connection-settings 'sql-database)))))) + (assq-ref connection-settings 'sql-database)))))) (define (schema db) (let ((tables (tables db))) @@ -83,14 +89,14 @@ is a <table> object." (table-name table))))) (triple table-id 'rdf:type 'gn:sqlTable) (triple table-id 'gn:name (table-name table)) - (triple table-id 'gn:hasSize (table-size table)) + (triple table-id 'gn:has_size (table-size table)) (for-each (lambda (column) (let ((column-id (column-id (table-name table) (column-name column)))) - (triple column-id 'rdf:type 'gn:sqlTableField) + (triple column-id 'rdf:type 'gn:sql_table_field) (triple column-id 'gn:name (column-name column)) - (triple column-id 'gn:sqlFieldType (column-type column)) - (triple table-id 'gn:hasField column-id))) + (triple column-id 'gn:sql_field_type (column-type column)) + (triple table-id 'gn:has_field column-id))) (table-columns table)))) tables))) diff --git a/transform/special-forms.scm b/transform/special-forms.scm index 99b30df..ddb3180 100644 --- a/transform/special-forms.scm +++ b/transform/special-forms.scm @@ -537,40 +537,42 @@ The above query results to triples that have the form: (call-with-target-database connection (lambda (db) - (with-output-to-file ; - doc-path - (lambda () - (format #t "# ~a" name) - (for-each - (lambda (proc) - (proc db - #:metadata? #f - #:data? #f - #:documentation? - (lambda () (for-each - (match-lambda - ((k v) - (begin - (prefix k v #f)))) - prefixes)))) - inputs)) - #:encoding "UTF-8") + (when doc-path + (with-output-to-file ; + doc-path + (lambda () + (format #t "# ~a" name) + (for-each + (lambda (proc) + (proc db + #:metadata? #f + #:data? #f + #:documentation? 
+ (lambda () (for-each + (match-lambda + ((k v) + (begin + (prefix k v #f)))) + prefixes)))) + inputs)) + #:encoding "UTF-8")) ;; Dumping the actual data - (with-output-to-file - rdf-path - (lambda () - ;; Add the prefixes - (for-each - (match-lambda - ((k v) - (begin - (prefix k v)))) - prefixes) - (newline) - (for-each - (lambda (proc) - (proc db #:metadata? table-metadata?)) - inputs)) - #:encoding "UTF-8"))))))) + (when rdf-path + (with-output-to-file + rdf-path + (lambda () + ;; Add the prefixes + (for-each + (match-lambda + ((k v) + (begin + (prefix k v)))) + prefixes) + (newline) + (for-each + (lambda (proc) + (proc db #:metadata? table-metadata?)) + inputs)) + #:encoding "UTF-8")))))))) diff --git a/transform/sql.scm b/transform/sql.scm index a8962c8..daedf97 100644 --- a/transform/sql.scm +++ b/transform/sql.scm @@ -102,13 +102,14 @@ (dbi-get_row db)) (define (call-with-target-database connection-settings proc) - (call-with-database "mysql" (string-join - (list (assq-ref connection-settings 'sql-username) - (assq-ref connection-settings 'sql-password) - (assq-ref connection-settings 'sql-database) - "tcp" - (assq-ref connection-settings 'sql-host) - (number->string - (assq-ref connection-settings 'sql-port))) - ":") + (call-with-database "mysql" (string-append (string-join + (list (assq-ref connection-settings 'sql-username) + (assq-ref connection-settings 'sql-password) + (assq-ref connection-settings 'sql-database) + "tcp" + (assq-ref connection-settings 'sql-host) + (number->string + (assq-ref connection-settings 'sql-port))) + ":") + "?charset=utf8") proc)) diff --git a/transform/strings.scm b/transform/strings.scm index 7545f62..aba554a 100644 --- a/transform/strings.scm +++ b/transform/strings.scm @@ -11,11 +11,11 @@ delete-substrings replace-substrings remove-duplicates - remap-species-identifiers str sanitize-rdf-string snake->lower-camel lower-case-and-replace-spaces - string-capitalize-first)) + string-capitalize-first + normalize-string-field)) 
(define (lower-case-and-replace-spaces str) (string-map @@ -121,13 +121,12 @@ association list mapping substrings to their replacements." ((memq (car lst) result) (loop (cdr lst) result)) (else (loop (cdr lst) (cons (car lst) result)))))) - -(define (remap-species-identifiers str) - "This procedure remaps identifiers to standard binominal. Obviously this should - be sorted by correcting the database!" - (match str - ["Fly (Drosophila melanogaster dm6)" "Drosophila melanogaster"] - ["Oryzias latipes (Japanese medaka)" "Oryzias latipes"] - ["Macaca mulatta" "Macaca nemestrina"] - ["Bat (Glossophaga soricina)" "Glossophaga soricina"] - [str str])) +(define (normalize-string-field field) + (let ((field (string-trim-both field))) + (match field + ((? string? field) + (if (or (string-blank? field) + (string=? (string-downcase field) "none")) + "" + field)) + (_ "")))) diff --git a/transform/triples.scm b/transform/triples.scm index 9775d36..13758e5 100644 --- a/transform/triples.scm +++ b/transform/triples.scm @@ -8,8 +8,19 @@ triple scm->triples annotate-field + remap-species-identifiers string->binomial-name)) +(define (remap-species-identifiers str) + "This procedure remaps identifiers to standard binominal. Obviously this should + be sorted by correcting the database!" + (match str + ["Fly (Drosophila melanogaster dm6)" "Drosophila melanogaster"] + ["Oryzias latipes (Japanese medaka)" "Oryzias latipes"] + ["Macaca mulatta" "Macaca nemestrina"] + ["Bat (Glossophaga soricina)" "Glossophaga soricina"] + [str str])) + (define (annotate-field field schema) (let ([schema (cond ((symbol? schema) (symbol->string schema)) @@ -28,7 +39,7 @@ #:optional #:key (ontology "gn:") (separator "") - (proc string-capitalize-first)) + (proc (lambda (x) x))) "Convert STR to a turtle identifier after replacing illegal characters with an underscore and prefixing with gn:PREFIX." (if (or (and (string? str) (string-null? 
str)) @@ -40,11 +51,12 @@ characters with an underscore and prefixing with gn:PREFIX." (lambda (c) (eq? c #\))) (string-map (lambda (c) - (case c - ((#\/ #\< #\> #\+ #\( #\space #\@) #\_) - (else c))) - (proc - (string-trim-right str #\.)))))))) + (if (or (char-alphabetic? c) + (char-numeric? c) + (char=? c #\_)) + c + #\_)) + (proc str))))))) (define* (prefix prefix iri #:optional (ttl? #t)) diff --git a/transform/uuid.scm b/transform/uuid.scm deleted file mode 100644 index be0e592..0000000 --- a/transform/uuid.scm +++ /dev/null @@ -1,234 +0,0 @@ -;; CREDIT: https://lists.gnu.org/archive/html/guile-user/2018-01/msg00019.html -(define-module (transform uuid) - #:use-module (srfi srfi-1) - #:use-module (srfi srfi-11) - #:use-module (rnrs bytevectors) - #:use-module (ice-9 iconv) - #:export (bytevector->md5 - make-version-3-uuid)) - -(define (bytevector->md5 bytevector) - "Convert BYTEVECTOR to a bytevector containing the MD5 hash of -BYTEVECTOR." - ;; Implemented along RFC 1321. It should be easy to verify that - ;; this procedure performs the operations specified therein. - (define (append-padding-bits bytevector) - "Makes a list from BYTEVECTOR with padding as per RFC 1321 3.1." - (let* ((length-in-bits (* 8 (bytevector-length bytevector))) - (padding-bits (- 512 (modulo (- length-in-bits 448) 512)))) - (append (bytevector->u8-list bytevector) - '(128) ; #*10000000 - (iota - (- (/ padding-bits 8) 1) - 0 0)))) - (define (append-length msg-list message-length) - "Append MESSAGE-LENGTH as 8 byte values from a uint64 to MSG-LIST." - (append msg-list - ;; For numbers too large for an uint64, only the low-order - ;; bytes are returned. 
- (bytevector->u8-list (u64vector - (modulo - (* message-length 8) ; bits - (1+ #xffffffffffffffff)))))) - (let hash ((AA #x67452301) - (BB #xefcdab89) - (CC #x98badcfe) - (DD #x10325476) - (to-digest - (append-length - (append-padding-bits - bytevector) - (bytevector-length bytevector)))) - (define (F X Y Z) - (logior (logand X Y) (logand (lognot X) Z))) - (define (G X Y Z) - (logior (logand X Z) (logand Y (lognot Z)))) - (define (H X Y Z) - (logxor X Y Z)) - (define (I X Y Z) - (logxor Y (logior X (lognot Z)))) - (define (T i) - (inexact->exact (floor (* 4294967296 (abs (sin i)))))) - (define (number->u32 n) - "Cut off all bits that do not fit in a uint32." - (bit-extract n 0 32)) - (define (lsh32 n count) - (number->u32 (logior (ash n count) - (bit-extract n (- 32 count) 32)))) - (if (not (null? to-digest)) - (let* ((block (u8-list->bytevector - (list-head to-digest (/ 512 8)))) - (X (lambda (j) (bytevector-u32-ref - block (* 4 j) (endianness little)))) - (do-round1 - (lambda (A B C D) - (define (operation a b c d k s i) - (number->u32 - (+ b (lsh32 (+ a (F b c d) (X k) (T i)) s)))) - (let* ((A (operation A B C D 0 7 1)) - (D (operation D A B C 1 12 2)) - (C (operation C D A B 2 17 3)) - (B (operation B C D A 3 22 4)) - (A (operation A B C D 4 7 5)) - (D (operation D A B C 5 12 6)) - (C (operation C D A B 6 17 7)) - (B (operation B C D A 7 22 8)) - (A (operation A B C D 8 7 9)) - (D (operation D A B C 9 12 10)) - (C (operation C D A B 10 17 11)) - (B (operation B C D A 11 22 12)) - (A (operation A B C D 12 7 13)) - (D (operation D A B C 13 12 14)) - (C (operation C D A B 14 17 15)) - (B (operation B C D A 15 22 16))) - (values A B C D)))) - (do-round2 - (lambda (A B C D) - (define (operation a b c d k s i) - (number->u32 - (+ b (lsh32 (+ a (G b c d) (X k) (T i)) s)))) - (let* ((A (operation A B C D 1 5 17)) - (D (operation D A B C 6 9 18)) - (C (operation C D A B 11 14 19)) - (B (operation B C D A 0 20 20)) - (A (operation A B C D 5 5 21)) - (D (operation D A B 
C 10 9 22)) - (C (operation C D A B 15 14 23)) - (B (operation B C D A 4 20 24)) - (A (operation A B C D 9 5 25)) - (D (operation D A B C 14 9 26)) - (C (operation C D A B 3 14 27)) - (B (operation B C D A 8 20 28)) - (A (operation A B C D 13 5 29)) - (D (operation D A B C 2 9 30)) - (C (operation C D A B 7 14 31)) - (B (operation B C D A 12 20 32))) - (values A B C D)))) - (do-round3 - (lambda (A B C D) - (define (operation a b c d k s i) - (number->u32 - (+ b (lsh32 (+ a (H b c d) (X k) (T i)) s)))) - (let* ((A (operation A B C D 5 4 33)) - (D (operation D A B C 8 11 34)) - (C (operation C D A B 11 16 35)) - (B (operation B C D A 14 23 36)) - (A (operation A B C D 1 4 37)) - (D (operation D A B C 4 11 38)) - (C (operation C D A B 7 16 39)) - (B (operation B C D A 10 23 40)) - (A (operation A B C D 13 4 41)) - (D (operation D A B C 0 11 42)) - (C (operation C D A B 3 16 43)) - (B (operation B C D A 6 23 44)) - (A (operation A B C D 9 4 45)) - (D (operation D A B C 12 11 46)) - (C (operation C D A B 15 16 47)) - (B (operation B C D A 2 23 48))) - (values A B C D)))) - (do-round4 - (lambda (A B C D) - (define (operation a b c d k s i) - (number->u32 - (+ b (lsh32 (+ a (I b c d) (X k) (T i)) s)))) - (let* ((A (operation A B C D 0 6 49)) - (D (operation D A B C 7 10 50)) - (C (operation C D A B 14 15 51)) - (B (operation B C D A 5 21 52)) - (A (operation A B C D 12 6 53)) - (D (operation D A B C 3 10 54)) - (C (operation C D A B 10 15 55)) - (B (operation B C D A 1 21 56)) - (A (operation A B C D 8 6 57)) - (D (operation D A B C 15 10 58)) - (C (operation C D A B 6 15 59)) - (B (operation B C D A 13 21 60)) - (A (operation A B C D 4 6 61)) - (D (operation D A B C 11 10 62)) - (C (operation C D A B 2 15 63)) - (B (operation B C D A 9 21 64))) - (values A B C D))))) - (let*-values (((A B C D) (values AA BB CC DD)) - ((A B C D) (do-round1 A B C D)) - ((A B C D) (do-round2 A B C D)) - ((A B C D) (do-round3 A B C D)) - ((A B C D) (do-round4 A B C D))) - (hash (number->u32 
(+ A AA)) - (number->u32 (+ B BB)) - (number->u32 (+ C CC)) - (number->u32 (+ D DD)) - (list-tail to-digest (/ 512 8))))) - ;; we’re done: - (u8-list->bytevector - (append - (bytevector->u8-list (u32vector AA)) - (bytevector->u8-list (u32vector BB)) - (bytevector->u8-list (u32vector CC)) - (bytevector->u8-list (u32vector DD))))))) - -(define* (make-version-3-uuid namespace-uuid str #:optional (prefix "urn:uuid:")) - "Generates a UUID string by computing the MD5 hash of NAMESPACE-UUID -and STR. NAMESPACE-UUID must be a bytevector consisting of the UUID’s -bytes, *not* the UUID’s string representation." - (define (half-byte->hex-char number) - "Returns the corresponding hexadecimal digit for a number NUMBER -between 0 and 15." - (case number - ((0) #\0) - ((1) #\1) - ((2) #\2) - ((3) #\3) - ((4) #\4) - ((5) #\5) - ((6) #\6) - ((7) #\7) - ((8) #\8) - ((9) #\9) - ((10) #\a) - ((11) #\b) - ((12) #\c) - ((13) #\d) - ((14) #\e) - ((15) #\f))) - (define (byte->hex-string bv index) - "Convert the byte at INDEX of bytevector BV to a hex string." 
- (let ((byte (bytevector-u8-ref bv index))) - (string (half-byte->hex-char (quotient byte 16)) - (half-byte->hex-char (modulo byte 16))))) - (let ((md5 (bytevector->md5 - (u8-list->bytevector - (append (bytevector->u8-list namespace-uuid) - (bytevector->u8-list (string->utf8 str))))))) - (string-append prefix - ;; time_low field: - (byte->hex-string md5 0) - (byte->hex-string md5 1) - (byte->hex-string md5 2) - (byte->hex-string md5 3) - "-" - ;; time_mid field: - (byte->hex-string md5 4) - (byte->hex-string md5 5) - "-" - ;; time_hi_and_version field: - (let ((byte (bytevector-u8-ref md5 6))) - (string (half-byte->hex-char 3) ; UUID version 3 - (half-byte->hex-char (modulo byte 16)))) - (byte->hex-string md5 7) - "-" - ;; clock_seq_hi_and_reserved field: - (let ((byte (bytevector-u8-ref md5 8))) - (string (half-byte->hex-char - (logior #b1000 ; most significant bits are 10 - (bit-extract (quotient byte 16) 0 2))) - (half-byte->hex-char (modulo byte 16)))) - ;; clock_seq_low field: - (byte->hex-string md5 9) - "-" - ;; node field: - (byte->hex-string md5 10) - (byte->hex-string md5 11) - (byte->hex-string md5 12) - (byte->hex-string md5 13) - (byte->hex-string md5 14) - (byte->hex-string md5 15)))) diff --git a/visualize-schema.scm b/visualize-schema.scm index 92f9272..13448cc 100755 --- a/visualize-schema.scm +++ b/visualize-schema.scm @@ -22,7 +22,7 @@ (prefix "http://www.w3.org/1999/02/22-rdf-syntax-ns#")) (define gn - (prefix "http://genenetwork.org/")) + (prefix "http://rdf.genenetwork.org/v1/")) (define graph (@@ (ccwl graphviz) graph)) (define graph-node (@@ (ccwl graphviz) graph-node)) @@ -149,17 +149,17 @@ is a <table> object." (map (cut string=? <> "1") (string-split field-transformed #\,)))))) (sparql-query-records - "PREFIX gn: <http://genenetwork.org/> -SELECT SAMPLE(?tablename) SAMPLE(?size) GROUP_CONCAT(?fieldname ; separator=\",\") GROUP_CONCAT(?fieldtype ; separator=\",\") GROUP_CONCAT(EXISTS{ ?transform rdf:type gn:transform . 
?transform gn:dependsOn ?field .} ; separator=\",\") + "PREFIX gn: <http://rdf.genenetwork.org/v1/> +SELECT SAMPLE(?tablename) SAMPLE(?size) GROUP_CONCAT(?fieldname ; separator=\",\") GROUP_CONCAT(?fieldtype ; separator=\",\") GROUP_CONCAT(EXISTS{ ?transform rdf:type gn:transform . ?transform gn:depends_on ?field .} ; separator=\",\") WHERE { ?table rdf:type gn:sqlTable ; gn:name ?tablename ; - gn:hasSize ?size ; - gn:hasField ?field . - ?field rdf:type gn:sqlTableField ; + gn:has_size ?size ; + gn:has_field ?field . + ?field rdf:type gn:sql_table_field ; gn:name ?fieldname ; - gn:sqlFieldType ?fieldtype . + gn:sql_field_type ?fieldtype . } GROUP BY ?table"))) (define (foreign-key-graphviz-edges tables) @@ -233,20 +233,20 @@ properties." (string-split fields #\,)) ", ")))))) (sparql-query-records - "PREFIX gn: <http://genenetwork.org/> + "PREFIX gn: <http://rdf.genenetwork.org/v1/> SELECT ?type ?predicate GROUP_CONCAT(?tablename ; separator=\",\") GROUP_CONCAT(?fieldname ; separator=\",\") WHERE { ?predicate rdfs:domain ?type ; rdfs:range rdfs:Literal . ?transform rdf:type gn:transform ; - gn:createsPredicate ?predicate ; - gn:forSubjectType ?type ; - gn:dependsOn ?field . - ?field rdf:type gn:sqlTableField ; + gn:creates_predicate ?predicate ; + gn:for_subject_type ?type ; + gn:depends_on ?field . + ?field rdf:type gn:sql_table_field ; gn:name ?fieldname . ?table rdf:type gn:sqlTable ; - gn:hasField ?field ; + gn:has_field ?field ; gn:name ?tablename . } GROUP BY ?type ?predicate "))) |
