diff options
Diffstat (limited to 'examples')
-rwxr-xr-x | examples/dump-dataset-metadata.scm | 182 |
1 files changed, 92 insertions, 90 deletions
diff --git a/examples/dump-dataset-metadata.scm b/examples/dump-dataset-metadata.scm index 53c381c..c51364a 100755 --- a/examples/dump-dataset-metadata.scm +++ b/examples/dump-dataset-metadata.scm @@ -52,11 +52,11 @@ (foaf:givenName rdfs:range rdfs:Literal) (foaf:familyName rdfs:range rdfs:Literal) (foaf:homepage rdfs:range rdfs:Literal) - (gn:address rdfs:range rdfs:Literal) - (gn:city rdfs:range rdfs:Literal) - (gn:state rdfs:range rdfs:Literal) - (gn:zipCode rdfs:range rdfs:Literal) - (gn:country rdfs:range rdfs:Literal)) + (gn-term:address rdfs:range rdfs:Literal) + (gn-term:city rdfs:range rdfs:Literal) + (gn-term:state rdfs:range rdfs:Literal) + (gn-term:zipCode rdfs:range rdfs:Literal) + (gn-term:country rdfs:range rdfs:Literal)) (triples (investigator-attributes->id (field Investigators FirstName) (field Investigators LastName) (field Investigators Email)) @@ -70,11 +70,11 @@ (set foaf:familyName (field ("CAST(CONVERT(BINARY CONVERT(LastName USING latin1) USING utf8) AS VARCHAR(100))" LastName))) (set foaf:homepage (field Investigators Url)) - (set gn:address (field Investigators Address)) - (set gn:city (field Investigators City)) - (set gn:state (field Investigators State)) - (set gn:zipCode (field Investigators ZipCode)) - (set gn:country (field Investigators Country)))) + (set gn-term:address (field Investigators Address)) + (set gn-term:city (field Investigators City)) + (set gn-term:state (field Investigators State)) + (set gn-term:zipCode (field Investigators ZipCode)) + (set gn-term:country (field Investigators Country)))) (define-dump dump-info-files (tables (InfoFiles @@ -92,123 +92,125 @@ (left-join GeneChip "USING (GeneChipId)")) "WHERE GN_AccesionId IS NOT NULL") (schema-triples - (gn:dataset rdfs:range rdfs:Literal) - (gn:datasetOfInvestigator rdfs:domain gn:dataset) - (gn:datasetOfOrganization rdfs:domain gn:dataset) - (gn:datasetOfInvestigator rdfs:range foaf:Person) - (gn:datasetOfInbredSet rdfs:domain gn:dataset) - (gn:datasetOfInbredSet rdfs:range gn:inbredSet) - (gn:datasetOfSpecies rdfs:domain gn:dataset) - (gn:datasetOfSpecies rdfs:range gn:inbredSet) - (gn:datasetOfTissue rdfs:domain gn:dataset) - (gn:datasetOfTissue rdfs:range gn:tissue) - (gn:normalization rdfs:domain gn:dataset) - (gn:normalization rdfs:range gn:avgMethod) - (gn:datasetOfPlatform rdfs:domain gn:dataset) - (gn:datasetOfPlatform rdfs:range gn:geneChip) - (gn:accessionId rdfs:range rdfs:Literal) - (gn:datasetStatusName rdfs:range rdfs:Literal) - (gn:summary rdfs:range rdfs:Literal) - (gn:aboutTissue rdfs:range rdfs:Literal) - (gn:geoSeries rdfs:range rdfs:Literal) - (gn:name rdfs:range rdfs:Literal) - (gn:title rdfs:range rdfs:Literal) - (gn:publicationTitle rdfs:range rdfs:Literal) - (gn:specifics rdfs:range rdfs:Literal) - (gn:datasetGroup rdfs:range rdfs:Literal) - (gn:aboutCases rdfs:range rdfs:Literal) - (gn:aboutPlatform rdfs:range rdfs:Literal) - (gn:aboutDataProcessing rdfs:range rdfs:Literal) - (gn:notes rdfs:range rdfs:Literal) - (gn:experimentDesign rdfs:range rdfs:Literal) - (gn:contributors rdfs:range rdfs:Literal) - (gn:citation rdfs:range rdfs:Literal) - (gn:acknowledgment rdfs:range rdfs:Literal)) - (triples (ontology 'dataset: - (regexp-substitute/global #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)) + (gn-term:dataset rdfs:range rdfs:Literal) + (gn-term:datasetOfInvestigator rdfs:domain gn:dataset) + (gn-term:datasetOfOrganization rdfs:domain gn:dataset) + (gn-term:datasetOfInvestigator rdfs:range foaf:Person) + (gn-term:datasetOfInbredSet rdfs:domain gn:dataset) + (gn-term:datasetOfInbredSet rdfs:range gn:inbredSet) + (gn-term:datasetOfSpecies rdfs:domain gn:dataset) + (gn-term:datasetOfSpecies rdfs:range gn:inbredSet) + (gn-term:datasetOfTissue rdfs:domain gn:dataset) + (gn-term:datasetOfTissue rdfs:range gn:tissue) + (gn-term:normalization rdfs:domain gn:dataset) + (gn-term:normalization rdfs:range gn:avgMethod) + (gn-term:datasetOfPlatform rdfs:domain gn:dataset) + (gn-term:datasetOfPlatform rdfs:range gn:geneChip) + (gn-term:accessionId rdfs:range rdfs:Literal) + (gn-term:datasetStatusName rdfs:range rdfs:Literal) + (gn-term:summary rdfs:range rdfs:Literal) + (gn-term:aboutTissue rdfs:range rdfs:Literal) + (gn-term:geoSeries rdfs:range rdfs:Literal) + (gn-term:name rdfs:range rdfs:Literal) + (gn-term:title rdfs:range rdfs:Literal) + (gn-term:publicationTitle rdfs:range rdfs:Literal) + (gn-term:specifics rdfs:range rdfs:Literal) + (gn-term:datasetGroup rdfs:range rdfs:Literal) + (gn-term:aboutCases rdfs:range rdfs:Literal) + (gn-term:aboutPlatform rdfs:range rdfs:Literal) + (gn-term:aboutDataProcessing rdfs:range rdfs:Literal) + (gn-term:notes rdfs:range rdfs:Literal) + (gn-term:experimentDesign rdfs:range rdfs:Literal) + (gn-term:contributors rdfs:range rdfs:Literal) + (gn-term:citation rdfs:range rdfs:Literal) + (gn-term:acknowledgment rdfs:range rdfs:Literal)) + (triples (string->identifier + "" (regexp-substitute/global #f "[^A-Za-z0-9:]" + (field InfoFiles InfoPageName) + 'pre "_" 'post) + #:separator "" + #:proc string-capitalize-first) (set rdf:type (string->symbol (field ("IF(GenoFreeze.Id IS NOT NULL, 'gn:genotypeDataset', IF(PublishFreeze.Id IS NOT NULL, 'gn:phenotypeDataset', 'gn:dataset'))" rdfType)))) - (set gn:name (regexp-substitute/global - #f "^[Nn]one$" - (field InfoFiles InfoPageName) - "")) - (set gn:fullName + (set gn-term:name (regexp-substitute/global + #f "^[Nn]one$" + (field InfoFiles InfoPageName) + "")) + (set gn-term:fullName (field ("IFNULL(GenoFreeze.FullName, IFNULL(PublishFreeze.FullName, ''))" DatasetFullName))) (set dct:created (field ("IFNULL(GenoFreeze.CreateTime, IFNULL(PublishFreeze.CreateTime, IFNULL(ProbeSetFreeze.CreateTime, '')))" createTimeGenoFreeze))) - (set gn:datasetOfInvestigator + (set gn-term:datasetOfInvestigator (investigator-attributes->id (field Investigators FirstName) (field Investigators LastName) (field Investigators Email))) - (set gn:datasetOfOrganization + (set gn-term:datasetOfOrganization (field ("CAST(CONVERT(BINARY CONVERT(Organizations.OrganizationName USING latin1) USING utf8) AS VARCHAR(1500))" Organizations))) - (set gn:accessionId (format #f "GN~a" (field InfoFiles GN_AccesionId))) - (set gn:datasetStatusName (string-downcase - (field DatasetStatus DatasetStatusName))) - (set gn:datasetOfInbredSet + (set gn-term:accessionId (format #f "GN~a" (field InfoFiles GN_AccesionId))) + (set gn-term:datasetStatusName (string-downcase + (field DatasetStatus DatasetStatusName))) + (set gn-term:datasetOfInbredSet (string->identifier "inbredSet" (field InbredSet Name InbredSetName))) - (set gn:datasetOfTissue (string->identifier "tissue" - (field Tissue Short_Name))) - (set gn:normalization + (set gn-term:datasetOfTissue (string->identifier "tissue" + (field Tissue Short_Name))) + (set gn-term:normalization (string->identifier "avgmethod" ;; If AvgMethodName is NULL, assume N/A. (if (string-blank? (field AvgMethod Name AvgMethodName)) "N/A" (field AvgMethod Name AvgMethodName)))) - (set gn:datasetOfPlatform + (set gn-term:datasetOfPlatform (string->identifier "platform" (field GeneChip Name GeneChip))) - (set gn:summary + (set gn-term:summary (sanitize-rdf-string (field Datasets Summary))) - (set gn:aboutTissue + (set gn-term:aboutTissue (sanitize-rdf-string (field Datasets AboutTissue))) - (set gn:geoSeries + (set gn-term:geoSeries (let ((s (string-match "GSE[0-9]*" (field ("IFNULL(Datasets.GeoSeries, '')" GeoSeries))))) (if s (ontology 'geoSeries: (match:substring s)) ""))) - (set gn:title + (set gn-term:title (regexp-substitute/global #f "^[Nn]one$" (field InfoFiles InfoFileTitle) "")) - (set gn:publicationTitle + (set gn-term:publicationTitle (regexp-substitute/global #f "^[Nn]one$" (field Datasets PublicationTitle) "")) - (set gn:specifics (sanitize-rdf-string (field InfoFiles Specifics))) - (set gn:datasetGroup (field Datasets DatasetName DatasetGroup)) - (set gn:aboutCases + (set gn-term:specifics (sanitize-rdf-string (field InfoFiles Specifics))) + (set gn-term:datasetGroup (field Datasets DatasetName DatasetGroup)) + (set gn-term:aboutCases (sanitize-rdf-string (field ("CAST(CONVERT(BINARY CONVERT(Datasets.AboutCases USING latin1) USING utf8) AS VARCHAR(10000))" AboutCases)))) - (set gn:aboutPlatform + (set gn-term:aboutPlatform (sanitize-rdf-string (field ("CAST(CONVERT(BINARY CONVERT(Datasets.AboutPlatform USING latin1) USING utf8) AS VARCHAR(1500))" AboutPlatform)))) - (set gn:aboutDataProcessing + (set gn-term:aboutDataProcessing (sanitize-rdf-string (field ("CAST(CONVERT(BINARY CONVERT(Datasets.AboutDataProcessing USING latin1) USING utf8) AS VARCHAR(1500))" AboutDataProcessing)))) - (set gn:notes + (set gn-term:notes (sanitize-rdf-string (field ("CAST(CONVERT(BINARY CONVERT(Datasets.Notes USING latin1) USING utf8) AS VARCHAR(1500))" GNNotes)))) - (set gn:experimentDesign + (set gn-term:experimentDesign (sanitize-rdf-string (field ("CAST(CONVERT(BINARY CONVERT(Datasets.ExperimentDesign USING latin1) USING utf8) AS VARCHAR(1500))" ExperimentDesign)))) - (set gn:contributors + (set gn-term:contributors (sanitize-rdf-string (field ("CAST(CONVERT(BINARY CONVERT(Datasets.Contributors USING latin1) USING utf8) AS VARCHAR(1500))" Contributors)))) - (set gn:citation + (set gn-term:citation (sanitize-rdf-string (regexp-substitute/global #f "^[Nn]one$" @@ -216,7 +218,7 @@ ("CAST(CONVERT(BINARY CONVERT(Datasets.Citation USING latin1) USING utf8) AS VARCHAR(1500))" Citation)) ""))) - (set gn:dataSourceAcknowledgment + (set gn-term:dataSourceAcknowledgment (sanitize-rdf-string (string-trim-both (regexp-substitute/global @@ -224,8 +226,8 @@ (field ("CAST(CONVERT(BINARY CONVERT(InfoFiles.Data_Source_Acknowledge USING latin1) USING utf8) AS VARCHAR(1500))" Data_Source_Acknowledge)) "")))) - (set gn:acknowledgment (sanitize-rdf-string - (field Datasets Acknowledgment))))) + (set gn-term:acknowledgment (sanitize-rdf-string + (field Datasets Acknowledgment))))) @@ -235,18 +237,18 @@ (connection %connection-settings) (table-metadata? #f) (prefixes - (("dct:" "<http://purl.org/dc/terms/>") - ("geoSeries:" "<http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=>") - ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") - ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") - ("gn:" "<http://genenetwork.org/terms/>") - ("foaf:" "<http://xmlns.com/foaf/0.1/>") - ("taxon:" "<http://purl.uniprot.org/taxonomy/>") - ("dataset:" "<http://genenetwork.org/dataset/>"))) + '(("foaf:" "<http://xmlns.com/foaf/0.1/>") + ("geoSeries:" "<http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=>") + ("gn-term:" "<http://genenetwork.org/term/>") + ("gn:" "<http://genenetwork.org/id/>") + ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") + ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") + ("taxon:" "<http://purl.uniprot.org/taxonomy/>") + ("dct:" "<http://purl.org/dc/terms/>"))) (inputs - (dump-info-files - dump-investigators)) + (list dump-info-files + dump-investigators)) (outputs - (#:documentation "./docs/dump-info-pages.md" - #:rdf "./verified-data/dump-info-pages.ttl"))) + '(#:documentation "./docs/dump-info-pages.md" + #:rdf "./verified-data/dump-info-pages.ttl"))) |