From 9f458b92aa28ac2d00b9bf822c516d0545a79bc9 Mon Sep 17 00:00:00 2001 From: zsloan Date: Fri, 11 Sep 2020 16:14:02 -0500 Subject: Wrote code and template for reproducing GN1's Info Page (page that displays info about a dataset + download links for its data) * wqflask/wqflask/db_info.py - Python that pulls the info out of the DB and parses the download links from the ipfs response * wqflask/wqflask/templates/info_page.html - template for displaying db info; need to make a lot of changes to its format/aesthetics --- wqflask/wqflask/db_info.py | 127 +++++++++++++++++++++++++++++++ wqflask/wqflask/templates/info_page.html | 92 ++++++++++++++++++++++ 2 files changed, 219 insertions(+) create mode 100644 wqflask/wqflask/db_info.py create mode 100644 wqflask/wqflask/templates/info_page.html diff --git a/wqflask/wqflask/db_info.py b/wqflask/wqflask/db_info.py new file mode 100644 index 00000000..f04e38bf --- /dev/null +++ b/wqflask/wqflask/db_info.py @@ -0,0 +1,127 @@ +import httplib, urllib2 +import re + +from flask import Flask, g + +from utility.logger import getLogger +logger = getLogger(__name__ ) + +class InfoPage(object): + def __init__(self, start_vars): + self.info = None + self.gn_accession_id = None + if 'gn_accession_id' in start_vars: + self.gn_accession_id = start_vars['gn_accession_id'] + self.info_page_name = start_vars['info_page_name'] + + self.get_info() + self.get_datasets_list() + + def get_info(self, create=False): + query_base = ("SELECT InfoPageName, GN_AccesionId, Species.MenuName, Species.TaxonomyId, Tissue.Name, InbredSet.Name, " + + "GeneChip.GeneChipName, GeneChip.GeoPlatform, AvgMethod.Name, Datasets.DatasetName, Datasets.GeoSeries, " + + "Datasets.PublicationTitle, DatasetStatus.DatasetStatusName, Datasets.Summary, Datasets.AboutCases, " + + "Datasets.AboutTissue, Datasets.AboutDataProcessing, Datasets.Acknowledgment, Datasets.ExperimentDesign, " + + "Datasets.Contributors, Datasets.Citation, Datasets.Notes, Investigators.FirstName, Investigators.LastName, " + + "Investigators.Address, Investigators.City, Investigators.State, Investigators.ZipCode, Investigators.Country, " + + "Investigators.Phone, Investigators.Email, Investigators.Url, Organizations.OrganizationName, " + + "InvestigatorId, DatasetId, DatasetStatusId, Datasets.AboutPlatform, InfoFileTitle, Specifics " + + "FROM InfoFiles " + + "LEFT JOIN Species USING (SpeciesId) " + + "LEFT JOIN Tissue USING (TissueId) " + + "LEFT JOIN InbredSet USING (InbredSetId) " + + "LEFT JOIN GeneChip USING (GeneChipId) " + + "LEFT JOIN AvgMethod USING (AvgMethodId) " + + "LEFT JOIN Datasets USING (DatasetId) " + + "LEFT JOIN Investigators USING (InvestigatorId) " + + "LEFT JOIN Organizations USING (OrganizationId) " + + "LEFT JOIN DatasetStatus USING (DatasetStatusId) WHERE ") + + if self.gn_accession_id: + final_query = query_base + "GN_AccesionId = {}".format(self.gn_accession_id) + results = g.db.execute(final_query).fetchone() + if self.info_page_name and not results: + final_query = query_base + "InfoPageName={}".format(self.info_page_name) + elif self.info_page_name: + final_query = query_base + "InfoPageName={}".format(self.info_page_name) + results = g.db.execute(final_query).fetchone() + else: + raise 'No correct parameter found' + + if results: + self.info = process_query_results(results) + + if (not results or len(results) < 1) and self.info_page_name and create: + insert_sql = "INSERT INTO InfoFiles SET InfoFiles.InfoPageName={}".format(self.info_page_name) + return self.get_info() + + if not self.gn_accession_id and self.info: + self.gn_accession_id = self.info['accession_id'] + if not self.info_page_name and self.info: + self.info_page_name = self.info['info_page_name'] + + def get_datasets_list(self): + self.filelist = [] + try: + response = urllib2.urlopen("http://datafiles.genenetwork.org/download/GN%s" % self.gn_accession_id) + data = response.read() + + matches = re.findall(r".+?", data, re.DOTALL) + for i, match in enumerate(matches): + if i == 0: + continue + cells = re.findall(r".+?", match, re.DOTALL) + full_filename = re.search(r"(.+?)<", cells[2]).group(1).strip() + filedate = "N/A" #ZS: Since we can't get it for now + + self.filelist.append([filename, filedate, filesize]) + except Exception, e: + pass + +def process_query_results(results): + info_ob = { + 'info_page_name': results[0], + 'accession_id': results[1], + 'menu_name': results[2], + 'taxonomy_id': results[3], + 'tissue_name': results[4], + 'group_name': results[5], + 'gene_chip_name': results[6], + 'geo_platform': results[7], + 'avg_method_name': results[8], + 'dataset_name': results[9], + 'geo_series': results[10], + 'publication_title': results[11], + 'dataset_status_name': results[12], + 'dataset_summary': results[13], + 'about_cases': results[14], + 'about_tissue': results[15], + 'about_data_processing': results[16], + 'acknowledgement': results[17], + 'experiment_design': results[18], + 'contributors': results[19], + 'citation': results[20], + 'notes': results[21], + 'investigator_firstname': results[22], + 'investigator_lastname': results[23], + 'investigator_address': results[24], + 'investigator_city': results[25], + 'investigator_state': results[26], + 'investigator_zipcode': results[27], + 'investigator_country': results[28], + 'investigator_phone': results[29], + 'investigator_email': results[30], + 'investigator_url': results[31], + 'organization_name': results[32], + 'investigator_id': results[33], + 'dataset_id': results[34], + 'dataset_status_is': results[35], + 'about_platform': results[36], + 'info_file_title': results[37], + 'specifics': results[38] + } + + return info_ob + \ No newline at end of file diff --git a/wqflask/wqflask/templates/info_page.html b/wqflask/wqflask/templates/info_page.html new file mode 100644 index 00000000..d8b7d74c --- /dev/null +++ b/wqflask/wqflask/templates/info_page.html @@ -0,0 +1,92 @@ +{% extends "base.html" %} +{% block title %}Policies{% endblock %} +{% block content %} + +

Data Set Group: {{ info.dataset_name }} + +{{ info.info_page_name }} +

+ + + + + +
+ + + + + + + + + + + +
Data Set: {{ info.info_file_title }}
GN Accession: GN{{ gn_accession_id }}
GEO Series: {{ info.geo_series }}
Title: {{ info.publication_title }}
Organism: {{ info.menu_name }}
Group: {{ info.group_name }}
Tissue: {{ info.tissue_name }}
Dataset Status: {{ info.dataset_status_name }}
Platforms: {{ info.gene_chip_name }}
Normalization: {{ info.avg_method_name }}
+
+ + + + + + + + + + + + + + + + + +
Contact Information
+ {{ info.investigator_first_name }} {{ info.inveestigator_last_name }}
+ {{ info.organization_name }}
+ {{ info.investigator_address }}
+ {{ info.investigator_city }}, {{ info.investigator_state }} {{ info.investigator_zipcode }} {{ info.investigator_country }}
+ Tel. {{ info.investigator_phone }}
+ {{ info.investigator_email }}
+ Website +
Download datasets and supplementary data files
+ +
+
+
+
+

+ + + + + + + + + + + + + + + + + + + + + + + + + +
Specifics of this Data Set:
{{ info.specifics|safe }}

Summary:
{{ info.dataset_summary|safe }}

About the cases used to generate this set of data:
{{ info.about_cases|safe }}

About the tissue used to generate this set of data:
{{ info.about_tissue|safe }}

About the array platform:
{{ info.about_platform|safe }}

About data values and data processing:
{{ info.about_data_processing|safe }}

Notes:
{{ info.notes|safe }}

Experiment Type:
{{ info.experiment_design|safe }}

Contributor:
{{ info.contributors|safe }}

Citation:
{{ info.citation|safe }}

Data source acknowledgment:
{{ info.acknowledgement|safe }}

Study Id:
{{ info.dataset_id }}

+

+ +{% endblock %} \ No newline at end of file -- cgit v1.2.3