From 06986d1c3a9d76e9e4b0f0d2a7c89746a5a19990 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Tue, 9 Feb 2021 23:28:26 +0300 Subject: add check for broken links in genenetwork homepage --- test/requests/links_scraper/genelinks.py | 92 ++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 test/requests/links_scraper/genelinks.py diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py new file mode 100644 index 00000000..223a2c31 --- /dev/null +++ b/test/requests/links_scraper/genelinks.py @@ -0,0 +1,92 @@ +import re +import requests +import urllib3 +import os +import logging + +from urllib.request import urlopen as uReq +from bs4 import BeautifulSoup as soup +from urllib.parse import urljoin + + +PORT = os.environ.get("PORT", "5004") + + +def test_link(link, strict=True): + print(f"link testing {link}") + results = None + try: + + results = requests.get(link, verify=False, timeout=10) + + except Exception as e: + if strict: + raise SystemExit( + "The link does not exists or is wrongly formatted") + else: + logging.error(f"FAILED:{link} does not exists or is wrongly formatted") + + status_code = results.status_code if results is not None else "404" + + print(f'the link {link} ---> {status_code}') + + +def fetch_css_links(parsed_page): + print("fetching css links") + for link in parsed_page.findAll("link"): + full_path = None + + link_url = link.attrs.get("href") + if re.match(r"^http://", link_url): + pass + # not sure whether to raise an error here for external css links + + elif re.match(r"^/css", link_url) or re.match(r"^/js", link_url): + full_path = urljoin('http://localhost:5004/', link_url) + + if full_path is not None: + test_link(full_path) + + +def fetch_html_links(parsed_page): + print("fetching a tags ") + + for link in parsed_page.findAll("a"): + full_path = None + link_url = link.attrs.get("href") + if re.match(r"^/", link_url): + full_path = urljoin('http://localhost:5004/', link_url) + + elif re.match(r'^http://', link_url): + full_path = link_url + + if full_path is not None: + test_link(full_path) + + +def fetch_script_tags(parsed_page): + print("--->fetching js links") + for link in parsed_page.findAll("script"): + js_link = link.attrs.get("src") + if js_link is not None: + if re.match(r'^http://', js_link): + raise SystemExit("Failed,the library should be packaged in guix.\ + Please contact,http://genenetwork.org/ for more details") + + elif re.match(r"^/css", js_link) or re.match(r"^/js", js_link): + full_path = urljoin('http://localhost:5004/', js_link) + test_link(full_path) + + +def fetch_page_links(page_url): + + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + html_page = uReq(page_url) + parsed_page = soup(html_page, "html.parser") + + fetch_script_tags(parsed_page=parsed_page) + fetch_css_links(parsed_page=parsed_page) + fetch_html_links(parsed_page=parsed_page) + + +fetch_page_links(f"http://localhost:{PORT}/") -- cgit v1.2.3 From cd5b32a69215bf5c168b7619ebc881908845204c Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 10 Feb 2021 08:11:33 +0300 Subject: add broken links checker to workflow --- .github/workflows/main.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 2342796a..79c69699 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -46,3 +46,22 @@ jobs: WEBSERVER_MODE=DEBUG LOG_LEVEL=DEBUG \ GENENETWORK_FILES=/genotype_files/ bin/genenetwork2 \ etc/default_settings.py -c -m unittest discover -v + + - name: Start Genenetwork as a Background Task + run: | + env GN2_PROFILE=/gn2-profile \ + TMPDIR=/tmp SERVER_PORT=5004 \ + WEBSERVER_MODE=DEBUG LOG_LEVEL=DEBUG \ + GENENETWORK_FILES=/genotype_files/ bin/genenetwork2 \ + etc/default_settings.py + + - name: Test for Broken Links + run: | + + env GN2_PROFILE=/gn2-profile \ + TMPDIR=/tmp\ + WEBSERVER_MODE=DEBUG LOG_LEVEL=DEBUG \ + GENENETWORK_FILES=/genotype_files/ bin/genenetwork2 \ + etc/default_settings.py -c -m\ + /__w/genenetwork/genenetwork/test/requests/links_scraper/genelinks.py + -- cgit v1.2.3 From 02ac2fd38fea7f85b3ef89464157a7c0d1ffcac7 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 10 Feb 2021 08:14:03 +0300 Subject: fix:starting genenetwork on the background --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 79c69699..a8642806 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -53,7 +53,7 @@ jobs: TMPDIR=/tmp SERVER_PORT=5004 \ WEBSERVER_MODE=DEBUG LOG_LEVEL=DEBUG \ GENENETWORK_FILES=/genotype_files/ bin/genenetwork2 \ - etc/default_settings.py + etc/default_settings.py& - name: Test for Broken Links run: | -- cgit v1.2.3 From e67e6d7f88ffb21e4101147c5abadb2b7c78e5ae Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 10 Feb 2021 08:18:47 +0300 Subject: fix error --- .github/workflows/main.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a8642806..d4649d4f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -62,6 +62,5 @@ jobs: TMPDIR=/tmp\ WEBSERVER_MODE=DEBUG LOG_LEVEL=DEBUG \ GENENETWORK_FILES=/genotype_files/ bin/genenetwork2 \ - etc/default_settings.py -c -m\ - /__w/genenetwork/genenetwork/test/requests/links_scraper/genelinks.py + etc/default_settings.py -c -m /__w/genenetwork/genenetwork/test/requests/links_scraper/genelinks.py -- cgit v1.2.3 From f3a8fdc660504e0ea74ae63d5ed7c891db6e3963 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 10 Feb 2021 08:20:18 +0300 Subject: fix paths issues --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d4649d4f..5d46ccb9 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -62,5 +62,5 @@ jobs: TMPDIR=/tmp\ WEBSERVER_MODE=DEBUG LOG_LEVEL=DEBUG \ GENENETWORK_FILES=/genotype_files/ bin/genenetwork2 \ - etc/default_settings.py -c -m /__w/genenetwork/genenetwork/test/requests/links_scraper/genelinks.py + etc/default_settings.py -c -m /__w/genenetwork2/genenetwork2/test/requests/links_scraper/genelinks.py -- cgit v1.2.3 From e1c3827d65a35d87e45cefe04135c1ff16374410 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Feb 2021 03:26:49 +0300 Subject: add links validator --- test/requests/links_scraper/genelinks.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py index 223a2c31..5dddcc47 100644 --- a/test/requests/links_scraper/genelinks.py +++ b/test/requests/links_scraper/genelinks.py @@ -7,11 +7,20 @@ import logging from urllib.request import urlopen as uReq from bs4 import BeautifulSoup as soup from urllib.parse import urljoin +from urllib.parse import urlparse PORT = os.environ.get("PORT", "5004") +def is_valid_link(url_link): + try: + result = urlparse(url_link) + return all([result.scheme, result.netloc, result.path]) + except Exception as e: + return False + + def test_link(link, strict=True): print(f"link testing {link}") results = None @@ -57,7 +66,8 @@ def fetch_html_links(parsed_page): if re.match(r"^/", link_url): full_path = urljoin('http://localhost:5004/', link_url) - elif re.match(r'^http://', link_url): + elif is_valid_link(link_url): + print(link_url) full_path = link_url if full_path is not None: -- cgit v1.2.3 From 6f3273429482dea1aaaa4e5fe61b178485e271b8 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Feb 2021 07:26:04 +0300 Subject: edit workflow --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 5d46ccb9..a36abc0a 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -62,5 +62,5 @@ jobs: TMPDIR=/tmp\ WEBSERVER_MODE=DEBUG LOG_LEVEL=DEBUG \ GENENETWORK_FILES=/genotype_files/ bin/genenetwork2 \ - etc/default_settings.py -c -m /__w/genenetwork2/genenetwork2/test/requests/links_scraper/genelinks.py + etc/default_settings.py -c /__w/genenetwork2/genenetwork2/test/requests/links_scraper/genelinks.py -- cgit v1.2.3 From 0cda88d6112ae0e4ae9ef5d4491a2d5695a07330 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Feb 2021 08:16:33 +0300 Subject: add check for url validity --- test/requests/links_scraper/genelinks.py | 61 ++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py index 5dddcc47..3b8ce230 100644 --- a/test/requests/links_scraper/genelinks.py +++ b/test/requests/links_scraper/genelinks.py @@ -12,6 +12,8 @@ from urllib.parse import urlparse PORT = os.environ.get("PORT", "5004") +BROKEN_LINKS = set() + def is_valid_link(url_link): try: @@ -21,23 +23,21 @@ def is_valid_link(url_link): return False -def test_link(link, strict=True): - print(f"link testing {link}") +def test_link(link): + print(f'Checking -->{link}') results = None try: results = requests.get(link, verify=False, timeout=10) + status_code = results.status_code except Exception as e: - if strict: - raise SystemExit( - "The link does not exists or is wrongly formatted") - else: - logging.error(f"FAILED:{link} does not exists or is wrongly formatted") + status_code = 408 - status_code = results.status_code if results is not None else "404" + if int(status_code) > 403: + return True - print(f'the link {link} ---> {status_code}') + return False def fetch_css_links(parsed_page): @@ -46,15 +46,15 @@ def fetch_css_links(parsed_page): full_path = None link_url = link.attrs.get("href") - if re.match(r"^http://", link_url): - pass - # not sure whether to raise an error here for external css links + if is_valid_link(link_url): + full_path = link_url elif re.match(r"^/css", link_url) or re.match(r"^/js", link_url): full_path = urljoin('http://localhost:5004/', link_url) if full_path is not None: - test_link(full_path) + if test_link(full_path): + BROKEN_LINKS.add(full_path) def fetch_html_links(parsed_page): @@ -67,11 +67,11 @@ def fetch_html_links(parsed_page): full_path = urljoin('http://localhost:5004/', link_url) elif is_valid_link(link_url): - print(link_url) full_path = link_url if full_path is not None: - test_link(full_path) + if test_link(full_path): + BROKEN_LINKS.add(full_path) def fetch_script_tags(parsed_page): @@ -79,13 +79,14 @@ def fetch_script_tags(parsed_page): for link in parsed_page.findAll("script"): js_link = link.attrs.get("src") if js_link is not None: - if re.match(r'^http://', js_link): + if is_valid_link(js_link): raise SystemExit("Failed,the library should be packaged in guix.\ Please contact,http://genenetwork.org/ for more details") elif re.match(r"^/css", js_link) or re.match(r"^/js", js_link): full_path = urljoin('http://localhost:5004/', js_link) - test_link(full_path) + if test_link(full_path): + BROKEN_LINKS.add(full_path) def fetch_page_links(page_url): @@ -99,4 +100,28 @@ def fetch_page_links(page_url): fetch_html_links(parsed_page=parsed_page) -fetch_page_links(f"http://localhost:{PORT}/") +def webpages_to_check(): + pages = [ + + "http://localhost:/5004", + + + + + + + ] + + return pages + + +if __name__ == '__main__': + for page in webpages_to_check(): + fetch_page_links(f"http://localhost:{PORT}/") + if BROKEN_LINKS is not None: + print("THE LINKS BELOW ARE BROKEN>>>>>>>>>>>>>") + for link in BROKEN_LINKS: + print(link) + + raise SystemExit( + "The links Above are broken.Please contact genenetwork.org<<<<<<<<") -- cgit v1.2.3 From 5f4ad48a7afaca3cf34266c1012efedb7ef2ec46 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Feb 2021 08:43:37 +0300 Subject: pep8 formatting --- test/requests/links_scraper/genelinks.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py index 3b8ce230..00a71d57 100644 --- a/test/requests/links_scraper/genelinks.py +++ b/test/requests/links_scraper/genelinks.py @@ -101,23 +101,14 @@ def fetch_page_links(page_url): def webpages_to_check(): - pages = [ - - "http://localhost:/5004", - - - - - - - ] + pages = [f"http://localhost:{PORT}/"] return pages if __name__ == '__main__': for page in webpages_to_check(): - fetch_page_links(f"http://localhost:{PORT}/") + fetch_page_links(page) if BROKEN_LINKS is not None: print("THE LINKS BELOW ARE BROKEN>>>>>>>>>>>>>") for link in BROKEN_LINKS: -- cgit v1.2.3 From a4e057917a42b073bcbe70d9ba80dd48ee56b618 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Feb 2021 21:47:41 +0300 Subject: add styles for broken links --- wqflask/wqflask/static/new/css/broken_links.css | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 wqflask/wqflask/static/new/css/broken_links.css diff --git a/wqflask/wqflask/static/new/css/broken_links.css b/wqflask/wqflask/static/new/css/broken_links.css new file mode 100644 index 00000000..676f32d9 --- /dev/null +++ b/wqflask/wqflask/static/new/css/broken_links.css @@ -0,0 +1,5 @@ + +.broken_link{ + color:red; + text-decoration: underline; +} \ No newline at end of file -- cgit v1.2.3 From 03cdfbe9f59414cd5c1d44a7be8c69a41c469930 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Feb 2021 21:48:12 +0300 Subject: replace broken links with text --- wqflask/wqflask/templates/base.html | 10 ++++++---- wqflask/wqflask/templates/index_page_orig.html | 4 ++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/wqflask/wqflask/templates/base.html b/wqflask/wqflask/templates/base.html index ec500d1e..ccb2ac5a 100644 --- a/wqflask/wqflask/templates/base.html +++ b/wqflask/wqflask/templates/base.html @@ -21,6 +21,8 @@ + + {% block css %} {% endblock %} @@ -80,9 +82,9 @@ Tools
@@ -197,7 +199,7 @@ (P20-DA 21131, 2001-2012)- Development and source code on github with issue tracker and documentation. Join the mailing list and find us on IRC (#genenetwork channel). + Development and source code on github with issue tracker and documentation. Join the mailing list and find us on IRC (#genenetwork channel). {% if version: %}
GeneNetwork {{ version }}
{% endif %} diff --git a/wqflask/wqflask/templates/index_page_orig.html b/wqflask/wqflask/templates/index_page_orig.html index 16caa30b..7f82b35c 100755 --- a/wqflask/wqflask/templates/index_page_orig.html +++ b/wqflask/wqflask/templates/index_page_orig.html @@ -193,7 +193,7 @@The entire procedure can be reapplied once the initial outlier data sets have been eliminated to detect any remaining outlier data sets. -
DataDesk was used to examine the statistical quality of the probe level (CEL) data after step 5 below. DataDesk allows the rapid detection of subsets of probes that are particularly sensitive to still unknown factors in array processing. Arrays can then be categorized at the probe level into "reaction classes." A reaction class is a group of arrays for which the expression of essentially all probes are colinear over the full range of log2 values. A single but large group of arrays (n = 32) processed in essentially the identical manner by a single operator can produce arrays belonging to as many as four different reaction classes. Reaction classes are NOT related to strain, age, sex, treatment, or any known biological parameter (technical replicates can belong to different reaction classes). We do not yet understand the technical origins of reaction classes. The number of probes that contribute to the definition of reaction classes is quite small (<10% of all probes). We have categorized all arrays in this data set into one of 5 reaction classes. These have then been treated as if they were separate batches. Probes in these data type "batches" have been aligned to a common mean as described below. +
DataDesk was used to examine the statistical quality of the probe level (CEL) data after step 5 below. DataDesk allows the rapid detection of subsets of probes that are particularly sensitive to still unknown factors in array processing. Arrays can then be categorized at the probe level into "reaction classes." A reaction class is a group of arrays for which the expression of essentially all probes are colinear over the full range of log2 values. A single but large group of arrays (n = 32) processed in essentially the identical manner by a single operator can produce arrays belonging to as many as four different reaction classes. Reaction classes are NOT related to strain, age, sex, treatment, or any known biological parameter (technical replicates can belong to different reaction classes). We do not yet understand the technical origins of reaction classes. The number of probes that contribute to the definition of reaction classes is quite small (<10% of all probes). We have categorized all arrays in this data set into one of 5 reaction classes. These have then been treated as if they were separate batches. Probes in these data type "batches" have been aligned to a common mean as described below. -
Probe (cell) level data from the CEL file: These CEL values produced by GCOS are 75% quantiles from a set of 91 pixel values per cell. +
Probe (cell) level data from the CEL file: These CEL values produced by GCOS are 75% quantiles from a set of 91 pixel values per cell.
Pooled RNA samples (usually one pool of male hippocampii and one pool of female hippocampii) were prepared using standard protocols. Samples were processed using a total of 206 Affymetrix GeneChip Mouse Expression 430 2.0 short oligomer arrays (MOE430 2.0 or M430v2; see GEO platform ID GPL1261), of which 201 passed quality control and error checking. This particular data set was processed using the PDNN protocol. To simplify comparisons among transforms, PDNN values of each array were adjusted to an average of 8 units and a standard deviation of 2 units. +
Pooled RNA samples (usually one pool of male hippocampii and one pool of female hippocampii) were prepared using standard protocols. Samples were processed using a total of 206 Affymetrix GeneChip Mouse Expression 430 2.0 short oligomer arrays (MOE430 2.0 or M430v2; see GEO platform ID GPL1261), of which 201 passed quality control and error checking. This particular data set was processed using the PDNN protocol. To simplify comparisons among transforms, PDNN values of each array were adjusted to an average of 8 units and a standard deviation of 2 units.
Pooled RNA samples (usually one pool of male hippocampii and one pool of female hippocampii) were prepared using standard protocols. Samples were processed using a total of 206 Affymetrix GeneChip Mouse Expression 430 2.0 short oligomer arrays (MOE430 2.0 or M430v2; see GEO platform ID GPL1261), of which 201 passed quality control and error checking. This particular data set was processed using the PDNN protocol. To simplify comparisons among transforms, PDNN values of each array were adjusted to an average of 8 units and a standard deviation of 2 units. +
Pooled RNA samples (usually one pool of male hippocampii and one pool of female hippocampii) were prepared using standard protocols. Samples were processed using a total of 206 Affymetrix GeneChip Mouse Expression 430 2.0 short oligomer arrays (MOE430 2.0 or M430v2; see GEO platform ID GPL1261), of which 201 passed quality control and error checking. This particular data set was processed using the PDNN protocol. To simplify comparisons among transforms, PDNN values of each array were adjusted to an average of 8 units and a standard deviation of 2 units.
Please cite: Overall RW, Kempermann G, Peirce J, Lu L, Goldowitz D, Gage FH, Goodwin S, Smit AB, Airey DC, Rosen GD, Schalkwyk LC, Sutter TR, Nowakowski RS, Whatley S, Williams RW (2009) Genetics of the hippocampal transcriptome in mice: a systematic survey and online neurogenomic resource. Front. Neurogen. 1:3 Full Text HTML doi:10.3389/neuro.15.003.2009 +
Please cite: Overall RW, Kempermann G, Peirce J, Lu L, Goldowitz D, Gage FH, Goodwin S, Smit AB, Airey DC, Rosen GD, Schalkwyk LC, Sutter TR, Nowakowski RS, Whatley S, Williams RW (2009) Genetics of the hippocampal transcriptome in mice: a systematic survey and online neurogenomic resource. Front. Neurogen. 1:3 Full Text HTML doi:10.3389/neuro.15.003.2009