From 06986d1c3a9d76e9e4b0f0d2a7c89746a5a19990 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Tue, 9 Feb 2021 23:28:26 +0300 Subject: add check for broken links in genenetwork homepage --- test/requests/links_scraper/genelinks.py | 92 ++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 test/requests/links_scraper/genelinks.py diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py new file mode 100644 index 00000000..223a2c31 --- /dev/null +++ b/test/requests/links_scraper/genelinks.py @@ -0,0 +1,92 @@ +import re +import requests +import urllib3 +import os +import logging + +from urllib.request import urlopen as uReq +from bs4 import BeautifulSoup as soup +from urllib.parse import urljoin + + +PORT = os.environ.get("PORT", "5004") + + +def test_link(link, strict=True): + print(f"link testing {link}") + results = None + try: + + results = requests.get(link, verify=False, timeout=10) + + except Exception as e: + if strict: + raise SystemExit( + "The link does not exists or is wrongly formatted") + else: + logging.error(f"FAILED:{link} does not exists or is wrongly formatted") + + status_code = results.status_code if results is not None else "404" + + print(f'the link {link} ---> {status_code}') + + +def fetch_css_links(parsed_page): + print("fetching css links") + for link in parsed_page.findAll("link"): + full_path = None + + link_url = link.attrs.get("href") + if re.match(r"^http://", link_url): + pass + # not sure whether to raise an error here for external css links + + elif re.match(r"^/css", link_url) or re.match(r"^/js", link_url): + full_path = urljoin('http://localhost:5004/', link_url) + + if full_path is not None: + test_link(full_path) + + +def fetch_html_links(parsed_page): + print("fetching a tags ") + + for link in parsed_page.findAll("a"): + full_path = None + link_url = link.attrs.get("href") + if re.match(r"^/", link_url): + full_path = urljoin('http://localhost:5004/', link_url) + + elif re.match(r'^http://', link_url): + full_path = link_url + + if full_path is not None: + test_link(full_path) + + +def fetch_script_tags(parsed_page): + print("--->fetching js links") + for link in parsed_page.findAll("script"): + js_link = link.attrs.get("src") + if js_link is not None: + if re.match(r'^http://', js_link): + raise SystemExit("Failed,the library should be packaged in guix.\ + Please contact,http://genenetwork.org/ for more details") + + elif re.match(r"^/css", js_link) or re.match(r"^/js", js_link): + full_path = urljoin('http://localhost:5004/', js_link) + test_link(full_path) + + +def fetch_page_links(page_url): + + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + html_page = uReq(page_url) + parsed_page = soup(html_page, "html.parser") + + fetch_script_tags(parsed_page=parsed_page) + fetch_css_links(parsed_page=parsed_page) + fetch_html_links(parsed_page=parsed_page) + + +fetch_page_links(f"http://localhost:{PORT}/") -- cgit v1.2.3 From cd5b32a69215bf5c168b7619ebc881908845204c Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 10 Feb 2021 08:11:33 +0300 Subject: add broken links checker to workflow --- .github/workflows/main.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 2342796a..79c69699 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -46,3 +46,22 @@ jobs: WEBSERVER_MODE=DEBUG LOG_LEVEL=DEBUG \ GENENETWORK_FILES=/genotype_files/ bin/genenetwork2 \ etc/default_settings.py -c -m unittest discover -v + + - name: Start Genenetwork as a Background Task + run: | + env GN2_PROFILE=/gn2-profile \ + TMPDIR=/tmp SERVER_PORT=5004 \ + WEBSERVER_MODE=DEBUG LOG_LEVEL=DEBUG \ + GENENETWORK_FILES=/genotype_files/ bin/genenetwork2 \ + etc/default_settings.py + + - name: Test for Broken Links + run: | + + env GN2_PROFILE=/gn2-profile \ + TMPDIR=/tmp\ + WEBSERVER_MODE=DEBUG LOG_LEVEL=DEBUG \ + GENENETWORK_FILES=/genotype_files/ bin/genenetwork2 \ + etc/default_settings.py -c -m\ + /__w/genenetwork/genenetwork/test/requests/links_scraper/genelinks.py + -- cgit v1.2.3 From 02ac2fd38fea7f85b3ef89464157a7c0d1ffcac7 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 10 Feb 2021 08:14:03 +0300 Subject: fix:starting genenetwork on the background --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 79c69699..a8642806 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -53,7 +53,7 @@ jobs: TMPDIR=/tmp SERVER_PORT=5004 \ WEBSERVER_MODE=DEBUG LOG_LEVEL=DEBUG \ GENENETWORK_FILES=/genotype_files/ bin/genenetwork2 \ - etc/default_settings.py + etc/default_settings.py& - name: Test for Broken Links run: | -- cgit v1.2.3 From e67e6d7f88ffb21e4101147c5abadb2b7c78e5ae Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 10 Feb 2021 08:18:47 +0300 Subject: fix error --- .github/workflows/main.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a8642806..d4649d4f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -62,6 +62,5 @@ jobs: TMPDIR=/tmp\ WEBSERVER_MODE=DEBUG LOG_LEVEL=DEBUG \ GENENETWORK_FILES=/genotype_files/ bin/genenetwork2 \ - etc/default_settings.py -c -m\ - /__w/genenetwork/genenetwork/test/requests/links_scraper/genelinks.py + etc/default_settings.py -c -m /__w/genenetwork/genenetwork/test/requests/links_scraper/genelinks.py -- cgit v1.2.3 From f3a8fdc660504e0ea74ae63d5ed7c891db6e3963 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 10 Feb 2021 08:20:18 +0300 Subject: fix paths issues --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d4649d4f..5d46ccb9 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -62,5 +62,5 @@ jobs: TMPDIR=/tmp\ WEBSERVER_MODE=DEBUG LOG_LEVEL=DEBUG \ GENENETWORK_FILES=/genotype_files/ bin/genenetwork2 \ - etc/default_settings.py -c -m /__w/genenetwork/genenetwork/test/requests/links_scraper/genelinks.py + etc/default_settings.py -c -m /__w/genenetwork2/genenetwork2/test/requests/links_scraper/genelinks.py -- cgit v1.2.3 From e1c3827d65a35d87e45cefe04135c1ff16374410 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Feb 2021 03:26:49 +0300 Subject: add links validator --- test/requests/links_scraper/genelinks.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py index 223a2c31..5dddcc47 100644 --- a/test/requests/links_scraper/genelinks.py +++ b/test/requests/links_scraper/genelinks.py @@ -7,11 +7,20 @@ import logging from urllib.request import urlopen as uReq from bs4 import BeautifulSoup as soup from urllib.parse import urljoin +from urllib.parse import urlparse PORT = os.environ.get("PORT", "5004") +def is_valid_link(url_link): + try: + result = urlparse(url_link) + return all([result.scheme, result.netloc, result.path]) + except Exception as e: + return False + + def test_link(link, strict=True): print(f"link testing {link}") results = None @@ -57,7 +66,8 @@ def fetch_html_links(parsed_page): if re.match(r"^/", link_url): full_path = urljoin('http://localhost:5004/', link_url) - elif re.match(r'^http://', link_url): + elif is_valid_link(link_url): + print(link_url) full_path = link_url if full_path is not None: -- cgit v1.2.3 From 6f3273429482dea1aaaa4e5fe61b178485e271b8 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Feb 2021 07:26:04 +0300 Subject: edit workflow --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 5d46ccb9..a36abc0a 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -62,5 +62,5 @@ jobs: TMPDIR=/tmp\ WEBSERVER_MODE=DEBUG LOG_LEVEL=DEBUG \ GENENETWORK_FILES=/genotype_files/ bin/genenetwork2 \ - etc/default_settings.py -c -m /__w/genenetwork2/genenetwork2/test/requests/links_scraper/genelinks.py + etc/default_settings.py -c /__w/genenetwork2/genenetwork2/test/requests/links_scraper/genelinks.py -- cgit v1.2.3 From 0cda88d6112ae0e4ae9ef5d4491a2d5695a07330 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Feb 2021 08:16:33 +0300 Subject: add check for url validity --- test/requests/links_scraper/genelinks.py | 61 ++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py index 5dddcc47..3b8ce230 100644 --- a/test/requests/links_scraper/genelinks.py +++ b/test/requests/links_scraper/genelinks.py @@ -12,6 +12,8 @@ from urllib.parse import urlparse PORT = os.environ.get("PORT", "5004") +BROKEN_LINKS = set() + def is_valid_link(url_link): try: @@ -21,23 +23,21 @@ def is_valid_link(url_link): return False -def test_link(link, strict=True): - print(f"link testing {link}") +def test_link(link): + print(f'Checking -->{link}') results = None try: results = requests.get(link, verify=False, timeout=10) + status_code = results.status_code except Exception as e: - if strict: - raise SystemExit( - "The link does not exists or is wrongly formatted") - else: - logging.error(f"FAILED:{link} does not exists or is wrongly formatted") + status_code = 408 - status_code = results.status_code if results is not None else "404" + if int(status_code) > 403: + return True - print(f'the link {link} ---> {status_code}') + return False def fetch_css_links(parsed_page): @@ -46,15 +46,15 @@ def fetch_css_links(parsed_page): full_path = None link_url = link.attrs.get("href") - if re.match(r"^http://", link_url): - pass - # not sure whether to raise an error here for external css links + if is_valid_link(link_url): + full_path = link_url elif re.match(r"^/css", link_url) or re.match(r"^/js", link_url): full_path = urljoin('http://localhost:5004/', link_url) if full_path is not None: - test_link(full_path) + if test_link(full_path): + BROKEN_LINKS.add(full_path) def fetch_html_links(parsed_page): @@ -67,11 +67,11 @@ def fetch_html_links(parsed_page): full_path = urljoin('http://localhost:5004/', link_url) elif is_valid_link(link_url): - print(link_url) full_path = link_url if full_path is not None: - test_link(full_path) + if test_link(full_path): + BROKEN_LINKS.add(full_path) def fetch_script_tags(parsed_page): @@ -79,13 +79,14 @@ def fetch_script_tags(parsed_page): for link in parsed_page.findAll("script"): js_link = link.attrs.get("src") if js_link is not None: - if re.match(r'^http://', js_link): + if is_valid_link(js_link): raise SystemExit("Failed,the library should be packaged in guix.\ Please contact,http://genenetwork.org/ for more details") elif re.match(r"^/css", js_link) or re.match(r"^/js", js_link): full_path = urljoin('http://localhost:5004/', js_link) - test_link(full_path) + if test_link(full_path): + BROKEN_LINKS.add(full_path) def fetch_page_links(page_url): @@ -99,4 +100,28 @@ def fetch_page_links(page_url): fetch_html_links(parsed_page=parsed_page) -fetch_page_links(f"http://localhost:{PORT}/") +def webpages_to_check(): + pages = [ + + "http://localhost:/5004", + + + + + + + ] + + return pages + + +if __name__ == '__main__': + for page in webpages_to_check(): + fetch_page_links(f"http://localhost:{PORT}/") + if BROKEN_LINKS is not None: + print("THE LINKS BELOW ARE BROKEN>>>>>>>>>>>>>") + for link in BROKEN_LINKS: + print(link) + + raise SystemExit( + "The links Above are broken.Please contact genenetwork.org<<<<<<<<") -- cgit v1.2.3 From 5f4ad48a7afaca3cf34266c1012efedb7ef2ec46 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Feb 2021 08:43:37 +0300 Subject: pep8 formatting --- test/requests/links_scraper/genelinks.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py index 3b8ce230..00a71d57 100644 --- a/test/requests/links_scraper/genelinks.py +++ b/test/requests/links_scraper/genelinks.py @@ -101,23 +101,14 @@ def fetch_page_links(page_url): def webpages_to_check(): - pages = [ - - "http://localhost:/5004", - - - - - - - ] + pages = [f"http://localhost:{PORT}/"] return pages if __name__ == '__main__': for page in webpages_to_check(): - fetch_page_links(f"http://localhost:{PORT}/") + fetch_page_links(page) if BROKEN_LINKS is not None: print("THE LINKS BELOW ARE BROKEN>>>>>>>>>>>>>") for link in BROKEN_LINKS: -- cgit v1.2.3 From a4e057917a42b073bcbe70d9ba80dd48ee56b618 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Feb 2021 21:47:41 +0300 Subject: add styles for broken links --- wqflask/wqflask/static/new/css/broken_links.css | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 wqflask/wqflask/static/new/css/broken_links.css diff --git a/wqflask/wqflask/static/new/css/broken_links.css b/wqflask/wqflask/static/new/css/broken_links.css new file mode 100644 index 00000000..676f32d9 --- /dev/null +++ b/wqflask/wqflask/static/new/css/broken_links.css @@ -0,0 +1,5 @@ + +.broken_link{ + color:red; + text-decoration: underline; +} \ No newline at end of file -- cgit v1.2.3 From 03cdfbe9f59414cd5c1d44a7be8c69a41c469930 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Feb 2021 21:48:12 +0300 Subject: replace broken links with text --- wqflask/wqflask/templates/base.html | 10 ++++++---- wqflask/wqflask/templates/index_page_orig.html | 4 ++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/wqflask/wqflask/templates/base.html b/wqflask/wqflask/templates/base.html index ec500d1e..ccb2ac5a 100644 --- a/wqflask/wqflask/templates/base.html +++ b/wqflask/wqflask/templates/base.html @@ -21,6 +21,8 @@ + + {% block css %} {% endblock %} @@ -80,9 +82,9 @@ @@ -197,7 +199,7 @@ (P20-DA 21131, 2001-2012)
  • - NCI MMHCC (U01CA105417), NCRR, BIRN, (U24 RR021760) + NCI MMHCC (U01CA105417), NCRR, BIRN, (U24 RR021760)
  • @@ -205,7 +207,7 @@ JOSS

    - Development and source code on github with issue tracker and documentation. Join the mailing list and find us on IRC (#genenetwork channel). + Development and source code on github with issue tracker and documentation. Join the mailing list and find us on IRC (#genenetwork channel). {% if version: %}

    GeneNetwork {{ version }}

    {% endif %} diff --git a/wqflask/wqflask/templates/index_page_orig.html b/wqflask/wqflask/templates/index_page_orig.html index 16caa30b..7f82b35c 100755 --- a/wqflask/wqflask/templates/index_page_orig.html +++ b/wqflask/wqflask/templates/index_page_orig.html @@ -193,7 +193,7 @@

    Affiliates