diff options
Diffstat (limited to 'gn2/wqflask/db_info.py')
-rw-r--r-- | gn2/wqflask/db_info.py | 115 |
1 files changed, 115 insertions, 0 deletions
"""Dataset info-page lookup (gn2/wqflask/db_info.py)."""
import http.client
import urllib.request
import urllib.error
import urllib.parse
import re

from MySQLdb.cursors import DictCursor
from gn2.wqflask.database import database_connection
from gn2.utility.tools import get_setting


class InfoPage:
    """Metadata and downloadable-file list for one dataset info page.

    On construction the instance looks up a row of the ``InfoFiles`` table
    (joined with its related tables) and then fetches the list of files
    published for the dataset's accession id.

    Attributes:
        info: The row returned by the lookup (as produced by ``DictCursor``),
            or ``None`` when nothing matched.
        gn_accession_id: Accession id used for, or filled in from, the lookup.
        info_page_name: Info page name used for, or filled in from, the lookup.
        filelist: List of ``[filename, filedate, filesize]`` entries.
    """

    def __init__(self, start_vars):
        """Initialise from request variables and load all data.

        Args:
            start_vars: Mapping that must contain ``'info_page_name'`` and may
                contain ``'gn_accession_id'``.

        Raises:
            KeyError: If ``'info_page_name'`` is absent from ``start_vars``.
            ValueError: If neither identifier has a usable (truthy) value.
        """
        self.info = None
        self.gn_accession_id = None
        if 'gn_accession_id' in start_vars:
            self.gn_accession_id = start_vars['gn_accession_id']
        self.info_page_name = start_vars['info_page_name']

        self.get_info()
        self.get_datasets_list()

    def get_info(self, create=False):
        """Load the info-page row into ``self.info``.

        The accession id is preferred as the lookup key; the info page name
        is used only when no accession id is set. Whichever identifier was
        missing is filled in from the row that was found.

        Args:
            create: When true and nothing was found for a named info page,
                the lookup is run once more (with ``create=False``).

        Raises:
            ValueError: If neither ``gn_accession_id`` nor ``info_page_name``
                is set.
        """
        query_base = (
            "SELECT InfoPageName AS info_page_name, "
            "GN_AccesionId AS accession_id, "
            "Species.MenuName AS menu_name, "
            "Species.TaxonomyId AS taxonomy_id, "
            "Tissue.Name AS tissue_name, "
            "InbredSet.Name AS group_name, "
            "GeneChip.GeneChipName AS gene_chip_name, "
            "GeneChip.GeoPlatform AS geo_platform, "
            "AvgMethod.Name AS avg_method_name, "
            "Datasets.DatasetName AS dataset_name, "
            "Datasets.GeoSeries AS geo_series, "
            "Datasets.PublicationTitle AS publication_title, "
            "DatasetStatus.DatasetStatusName AS dataset_status_name, "
            "Datasets.Summary AS dataset_summary, "
            "Datasets.AboutCases AS about_cases, "
            "Datasets.AboutTissue AS about_tissue, "
            "Datasets.AboutDataProcessing AS about_data_processing, "
            "Datasets.Acknowledgment AS acknowledgement, "
            "Datasets.ExperimentDesign AS experiment_design, "
            "Datasets.Contributors AS contributors, "
            "Datasets.Citation AS citation, "
            "Datasets.Notes AS notes, "
            "Investigators.FirstName AS investigator_firstname, "
            "Investigators.LastName AS investigator_lastname, "
            "Investigators.Address AS investigator_address, "
            "Investigators.City AS investigator_city, "
            "Investigators.State AS investigator_state, "
            "Investigators.ZipCode AS investigator_zipcode, "
            "Investigators.Country AS investigator_country, "
            "Investigators.Phone AS investigator_phone, "
            "Investigators.Email AS investigator_email, "
            "Investigators.Url AS investigator_url, "
            "Organizations.OrganizationName AS organization_name, "
            "InvestigatorId AS investigator_id, "
            "DatasetId AS dataset_id, "
            "DatasetStatusId AS dataset_status_id, "
            "Datasets.AboutPlatform AS about_platform, "
            "InfoFileTitle AS info_file_title, "
            # The trailing space matters: without it the adjacent literals
            # fuse into "specificsFROM" and the statement cannot be parsed.
            "Specifics AS specifics "
            "FROM InfoFiles "
            "LEFT JOIN Species USING (SpeciesId) "
            "LEFT JOIN Tissue USING (TissueId) "
            "LEFT JOIN InbredSet USING (InbredSetId) "
            "LEFT JOIN GeneChip USING (GeneChipId) "
            "LEFT JOIN AvgMethod USING (AvgMethodId) "
            "LEFT JOIN Datasets USING (DatasetId) "
            "LEFT JOIN Investigators USING (InvestigatorId) "
            "LEFT JOIN Organizations USING (OrganizationId) "
            "LEFT JOIN DatasetStatus USING (DatasetStatusId) WHERE "
        )
        # Either identifier is enough to run the lookup; only the absence of
        # both is an error (the missing one is back-filled further down).
        if not any([self.gn_accession_id, self.info_page_name]):
            raise ValueError('No correct parameter found')

        with database_connection(get_setting("SQL_URI")) as conn, conn.cursor(DictCursor) as cursor:
            if self.gn_accession_id:
                cursor.execute(f"{query_base}GN_AccesionId = %s",
                               (self.gn_accession_id,))
            else:
                cursor.execute(f"{query_base}InfoPageName = %s",
                               (self.info_page_name,))
            row = cursor.fetchone()
            if row:
                self.info = row

        if not row and self.info_page_name and create:
            # Kept from the original flow: retry the lookup exactly once.
            return self.get_info()
        if not self.gn_accession_id and self.info:
            self.gn_accession_id = self.info['accession_id']
        if not self.info_page_name and self.info:
            self.info_page_name = self.info['info_page_name']

    def get_datasets_list(self):
        """Populate ``self.filelist`` from the public file server listing.

        This is best effort: ``self.filelist`` is left empty when there is no
        accession id or when the listing cannot be downloaded.
        """
        self.filelist = []
        if not self.gn_accession_id:
            # No accession id means there is no directory to list.
            return

        url = ("https://files.genenetwork.org/current/GN%s"
               % self.gn_accession_id)
        try:
            with urllib.request.urlopen(url) as response:
                raw = response.read()
        except (OSError, ValueError, http.client.HTTPException):
            # URLError/HTTPError are OSError subclasses; ValueError covers a
            # malformed URL. A missing listing must not break the info page.
            return

        # read() yields bytes, while the parsing patterns are str patterns.
        self.filelist = self._parse_file_listing(
            raw.decode("utf-8", errors="replace"))

    @staticmethod
    def _parse_file_listing(html):
        """Extract ``[filename, filedate, filesize]`` rows from listing HTML.

        The first ``<tr>`` is skipped as the header row. Rows that lack the
        expected cells or link are skipped rather than aborting the parse.

        Args:
            html: Text of the directory listing page.

        Returns:
            A list of ``[filename, "N/A", filesize]`` lists.
        """
        entries = []
        rows = re.findall(r"<tr>.+?</tr>", html, re.DOTALL)
        for row in rows[1:]:
            cells = re.findall(r"<td.+?>.+?</td>", row, re.DOTALL)
            if len(cells) < 3:
                continue
            link = re.search(r"<a href=\"(.+?)\"", cells[1], re.DOTALL)
            size = re.search(r">(.+?)<", cells[2])
            if link is None or size is None:
                continue
            filename = link.group(1).strip().split("/")[-1]
            filedate = "N/A"  # ZS: Since we can't get it for now
            entries.append([filename, filedate, size.group(1).strip()])
        return entries