# Origin: commit 06986d1c3a9d76e9e4b0f0d2a7c89746a5a19990
# Author: Alexander Kabui
# Date:   Tue, 9 Feb 2021 23:28:26 +0300
# Subject: add check for broken links in genenetwork homepage
# File:   test/requests/links_scraper/genelinks.py
"""Check the page served on localhost for broken links.

The page at ``http://localhost:$PORT/`` is downloaded and parsed, then every
``<script src>``, ``<link href>`` and ``<a href>`` that points at a local
``/css``/``/js`` asset, a site-relative path or a plain ``http://`` URL is
requested once and its HTTP status is printed.

NOTE: the check runs at import time (see the call at the bottom of the file).
"""

import re  # retained from the original import list
import requests
import urllib3
import os
import logging

from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
from urllib.parse import urljoin


PORT = os.environ.get("PORT", "5004")

# Root of the server under test. Built from PORT so that the links which get
# tested live on the same server the page was fetched from.
BASE_URL = f"http://localhost:{PORT}/"


def test_link(link, strict=True):
    """Request *link* once and print the resulting HTTP status.

    Args:
        link: Absolute URL to request.
        strict: When true, a failed request aborts the whole run; otherwise
            the failure is logged and reported with the placeholder "404".

    Raises:
        SystemExit: If the request fails and ``strict`` is true.
    """
    print(f"link testing {link}")
    results = None
    try:
        # Certificates are deliberately not verified (verify=False).
        results = requests.get(link, verify=False, timeout=10)
    except Exception as e:
        # Deliberately broad: in non-strict mode any failure for a single
        # link must be reported without stopping the remaining checks.
        if strict:
            raise SystemExit(
                "The link does not exists or is wrongly formatted") from e
        logging.error(f"FAILED:{link} does not exists or is wrongly formatted")

    # "404" is a placeholder used when no response was received at all.
    status_code = results.status_code if results is not None else "404"

    print(f'the link {link} ---> {status_code}')


def fetch_css_links(parsed_page, base_url=BASE_URL):
    """Test every local ``/css`` or ``/js`` target of the page's ``<link>`` tags.

    Args:
        parsed_page: Parsed document exposing ``findAll``.
        base_url: Root against which site-relative paths are resolved.
    """
    print("fetching css links")
    for link in parsed_page.findAll("link"):
        link_url = link.attrs.get("href")
        if not link_url:
            # <link> without an href: nothing to test.
            continue

        if link_url.startswith("http://"):
            # TODO: not sure whether to raise an error here for external
            # css links; for now they are skipped.
            continue

        if link_url.startswith(("/css", "/js")):
            test_link(urljoin(base_url, link_url))


def fetch_html_links(parsed_page, base_url=BASE_URL):
    """Test every site-relative or ``http://`` target of the page's ``<a>`` tags.

    Args:
        parsed_page: Parsed document exposing ``findAll``.
        base_url: Root against which site-relative paths are resolved.
    """
    print("fetching a tags ")

    for link in parsed_page.findAll("a"):
        link_url = link.attrs.get("href")
        if not link_url:
            # Anchors without an href (e.g. named anchors) have no target.
            continue

        if link_url.startswith("/"):
            full_path = urljoin(base_url, link_url)
        elif link_url.startswith("http://"):
            full_path = link_url
        else:
            # Anything else (https://, mailto:, #fragment, ...) is not tested.
            continue

        test_link(full_path)


def fetch_script_tags(parsed_page, base_url=BASE_URL):
    """Test local ``<script src>`` targets and reject external ``http://`` ones.

    Args:
        parsed_page: Parsed document exposing ``findAll``.
        base_url: Root against which site-relative paths are resolved.

    Raises:
        SystemExit: If a script is loaded from an ``http://`` URL.
    """
    print("--->fetching js links")
    for link in parsed_page.findAll("script"):
        js_link = link.attrs.get("src")
        if not js_link:
            # Inline script: no src to test.
            continue

        if js_link.startswith("http://"):
            raise SystemExit(
                "Failed,the library should be packaged in guix. "
                "Please contact,http://genenetwork.org/ for more details")

        if js_link.startswith(("/css", "/js")):
            test_link(urljoin(base_url, js_link))


def fetch_page_links(page_url):
    """Download *page_url*, parse it and test the links it contains.

    Site-relative links are resolved against *page_url* itself.

    Args:
        page_url: Absolute URL of the page to scan.
    """
    # test_link() requests with verify=False; silence the resulting warnings.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    # Parse while the response is open, then release the connection.
    with uReq(page_url) as html_page:
        parsed_page = soup(html_page, "html.parser")

    fetch_script_tags(parsed_page=parsed_page, base_url=page_url)
    fetch_css_links(parsed_page=parsed_page, base_url=page_url)
    fetch_html_links(parsed_page=parsed_page, base_url=page_url)


# Kept at module level (unguarded) to preserve the original run-on-import
# behaviour of this script.
fetch_page_links(BASE_URL)