Diffstat (limited to 'gn2/wqflask/db_info.py')
-rw-r--r--  gn2/wqflask/db_info.py | 115
1 file changed, 115 insertions(+), 0 deletions(-)
diff --git a/gn2/wqflask/db_info.py b/gn2/wqflask/db_info.py
new file mode 100644
index 00000000..f6b94dde
--- /dev/null
+++ b/gn2/wqflask/db_info.py
@@ -0,0 +1,115 @@
+import urllib.request
+import urllib.error
+import urllib.parse
+import re
+
+from MySQLdb.cursors import DictCursor
+from gn2.wqflask.database import database_connection
+from gn2.utility.tools import get_setting
+
+
+class InfoPage:
+    def __init__(self, start_vars):
+        self.info = None
+        self.gn_accession_id = None
+        if 'gn_accession_id' in start_vars:
+            self.gn_accession_id = start_vars['gn_accession_id']
+        self.info_page_name = start_vars['info_page_name']
+
+        self.get_info()
+        self.get_datasets_list()
+
+    def get_info(self, create=False):
+        query_base = (
+            "SELECT InfoPageName AS info_page_name, "
+            "GN_AccesionId AS accession_id, "  # sic: matches the column name used in the WHERE clause below
+            "Species.MenuName AS menu_name, "
+            "Species.TaxonomyId AS taxonomy_id, "
+            "Tissue.Name AS tissue_name, "
+            "InbredSet.Name AS group_name, "
+            "GeneChip.GeneChipName AS gene_chip_name, "
+            "GeneChip.GeoPlatform AS geo_platform, "
+            "AvgMethod.Name AS avg_method_name, "
+            "Datasets.DatasetName AS dataset_name, "
+            "Datasets.GeoSeries AS geo_series, "
+            "Datasets.PublicationTitle AS publication_title, "
+            "DatasetStatus.DatasetStatusName AS dataset_status_name, "
+            "Datasets.Summary AS dataset_summary, "
+            "Datasets.AboutCases AS about_cases, "
+            "Datasets.AboutTissue AS about_tissue, "
+            "Datasets.AboutDataProcessing AS about_data_processing, "
+            "Datasets.Acknowledgment AS acknowledgement, "
+            "Datasets.ExperimentDesign AS experiment_design, "
+            "Datasets.Contributors AS contributors, "
+            "Datasets.Citation AS citation, "
+            "Datasets.Notes AS notes, "
+            "Investigators.FirstName AS investigator_firstname, "
+            "Investigators.LastName AS investigator_lastname, "
+            "Investigators.Address AS investigator_address, "
+            "Investigators.City AS investigator_city, "
+            "Investigators.State AS investigator_state, "
+            "Investigators.ZipCode AS investigator_zipcode, "
+            "Investigators.Country AS investigator_country, "
+            "Investigators.Phone AS investigator_phone, "
+            "Investigators.Email AS investigator_email, "
+            "Investigators.Url AS investigator_url, "
+            "Organizations.OrganizationName AS organization_name, "
+            "InvestigatorId AS investigator_id, "
+            "DatasetId AS dataset_id, "
+            "DatasetStatusId AS dataset_status_id, "
+            "Datasets.AboutPlatform AS about_platform, "
+            "InfoFileTitle AS info_file_title, "
+            "Specifics AS specifics "
+            "FROM InfoFiles "
+            "LEFT JOIN Species USING (SpeciesId) "
+            "LEFT JOIN Tissue USING (TissueId) "
+            "LEFT JOIN InbredSet USING (InbredSetId) "
+            "LEFT JOIN GeneChip USING (GeneChipId) "
+            "LEFT JOIN AvgMethod USING (AvgMethodId) "
+            "LEFT JOIN Datasets USING (DatasetId) "
+            "LEFT JOIN Investigators USING (InvestigatorId) "
+            "LEFT JOIN Organizations USING (OrganizationId) "
+            "LEFT JOIN DatasetStatus USING (DatasetStatusId) WHERE "
+        )
+        if not any([self.gn_accession_id, self.info_page_name]):
+            raise ValueError("Expected either a GN accession id or an info page name")
+
+        results = {}
+        with database_connection(get_setting("SQL_URI")) as conn, conn.cursor(DictCursor) as cursor:
+            if self.gn_accession_id:
+                cursor.execute(f"{query_base}GN_AccesionId = %s",
+                               (self.gn_accession_id,))
+            elif self.info_page_name:
+                cursor.execute(f"{query_base}InfoPageName = %s",
+                               (self.info_page_name,))
+            if (results := cursor.fetchone()):
+                self.info = results
+            if ((not results or len(results) < 1)
+                    and self.info_page_name and create):
+                return self.get_info()
+            if not self.gn_accession_id and self.info:
+                self.gn_accession_id = self.info['accession_id']
+            if not self.info_page_name and self.info:
+                self.info_page_name = self.info['info_page_name']
+
+    def get_datasets_list(self):
+        self.filelist = []
+        try:
+            response = urllib.request.urlopen(
+                "https://files.genenetwork.org/current/GN%s" % self.gn_accession_id)
+            data = response.read().decode()
+
+            matches = re.findall(r"<tr>.+?</tr>", data, re.DOTALL)
+            for i, match in enumerate(matches):
+                if i == 0:  # skip the header row of the listing table
+                    continue
+                cells = re.findall(r"<td.+?>.+?</td>", match, re.DOTALL)
+                full_filename = re.search(
+                    r"<a href=\"(.+?)\"", cells[1], re.DOTALL).group(1).strip()
+                filename = full_filename.split("/")[-1]
+                filesize = re.search(r">(.+?)<", cells[2]).group(1).strip()
+                filedate = "N/A"  # ZS: Since we can't get it for now
+
+                self.filelist.append([filename, filedate, filesize])
+        except Exception:  # leave filelist empty if the listing cannot be fetched or parsed
+            pass
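
For review context, a minimal sketch of how InfoPage might be exercised from a Flask view. The blueprint, route path, template name, and request keys below are illustrative assumptions, not part of this change; only the constructor contract (a dict carrying 'gn_accession_id' and 'info_page_name') comes from the code above.

# Hypothetical usage sketch -- route and template names are assumptions.
from flask import Blueprint, render_template, request

from gn2.wqflask.db_info import InfoPage

info_blueprint = Blueprint("db_info", __name__)


@info_blueprint.route("/show_dataset_info")
def show_dataset_info():
    # InfoPage.__init__ reads 'info_page_name' unconditionally and
    # 'gn_accession_id' optionally; at least one must be non-empty,
    # otherwise get_info() raises ValueError.
    page = InfoPage({
        "gn_accession_id": request.args.get("gn_accession_id"),
        "info_page_name": request.args.get("info_page_name"),
    })
    # page.info is the joined InfoFiles row (a dict) or None;
    # page.filelist is a list of [filename, filedate, filesize] entries
    # scraped from https://files.genenetwork.org/current/GN<accession id>.
    return render_template("info_page.html", info=page.info,
                           filelist=page.filelist)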