add check for broken links in genenetwork homepage

author: Alexander Kabui 2021-02-09 23:28:26 +0300
committer: BonfaceKilz 2021-02-13 16:55:08 +0300
commit: 06986d1c3a9d76e9e4b0f0d2a7c89746a5a19990 (patch)
tree: 6cf4e1b104bdf6aad133116b6db7a0c481f48360 /test/requests/links_scraper/genelinks.py
parent: b07861ddd9805903f8b8ac509d68718481c61c3b (diff)
download: genenetwork2-06986d1c3a9d76e9e4b0f0d2a7c89746a5a19990.tar.gz
1 files changed, 92 insertions, 0 deletions
diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py
new file mode 100644
index 00000000..223a2c31
--- /dev/null
+++ b/test/requests/links_scraper/genelinks.py
@@ -0,0 +1,92 @@
+import re
+import requests
+import urllib3
+import os
+import logging
+
+from urllib.request import urlopen as uReq
+from bs4 import BeautifulSoup as soup
+from urllib.parse import urljoin
+
+
+PORT = os.environ.get("PORT", "5004")  # localhost port to scrape; $PORT overrides "5004"
+
+
+def test_link(link, strict=True):
+    """GET `link` and print the HTTP status code of the response.
+
+    A failed request is logged; then `strict` exits, else "404" is printed.
+    """
+    print(f"link testing {link}")
+    results = None
+    try:
+        results = requests.get(link, verify=False, timeout=10)
+    except requests.exceptions.RequestException as exc:
+        # SystemExit prints no traceback, so log the real cause first.
+        logging.error(f"FAILED:{link} does not exists or is wrongly formatted: {exc}")
+        if strict:
+            raise SystemExit(
+                "The link does not exists or is wrongly formatted") from exc
+    status_code = results.status_code if results is not None else "404"
+    print(f'the link {link} ---> {status_code}')
+
+
+def fetch_css_links(parsed_page):
+    """Check every local /css or /js href found in the page's <link> tags.
+
+    External (http://) stylesheets and tags without an href are skipped.
+    """
+    print("fetching css links")
+    for link in parsed_page.findAll("link"):
+        link_url = link.attrs.get("href")
+        # <link> may lack href; matching against None would raise TypeError.
+        if link_url is None or link_url.startswith("http://"):
+            # not sure whether to raise an error here for external css links
+            continue
+        if link_url.startswith(("/css", "/js")):
+            # Target the configured PORT, not a hard-coded 5004.
+            test_link(urljoin(f"http://localhost:{PORT}/", link_url))
+
+
+def fetch_html_links(parsed_page):
+    """Check every site-relative ("/...") or http:// href in the <a> tags.
+
+    Anchors without an href, and hrefs of any other form, are skipped.
+    """
+    print("fetching a tags ")
+    for link in parsed_page.findAll("a"):
+        # <a> may lack href; default to "" so the prefix checks cannot crash.
+        link_url = link.attrs.get("href") or ""
+        if link_url.startswith("/"):
+            # Resolve against the configured PORT, not a hard-coded 5004.
+            test_link(urljoin(f"http://localhost:{PORT}/", link_url))
+        elif link_url.startswith("http://"):
+            test_link(link_url)
+
+
+def fetch_script_tags(parsed_page):
+    """Check local /css or /js <script src>; exit on an external http:// one."""
+    print("--->fetching js links")
+    for link in parsed_page.findAll("script"):
+        js_link = link.attrs.get("src") or ""  # src may be absent
+        if js_link.startswith("http://"):
+            # Adjacent literals: the old backslash continuation embedded the
+            # next line's indentation in the message.
+            raise SystemExit("Failed,the library should be packaged in guix. "
+                             "Please contact,http://genenetwork.org/ for more details")
+        if js_link.startswith(("/css", "/js")):
+            test_link(urljoin(f"http://localhost:{PORT}/", js_link))
+
+
+def fetch_page_links(page_url):
+    """Fetch `page_url`, parse it, and check its script, link and a tags."""
+    # test_link() requests with verify=False; silence the resulting warnings.
+    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+    with uReq(page_url) as html_page:  # close the response once parsed
+        parsed_page = soup(html_page, "html.parser")
+    fetch_script_tags(parsed_page=parsed_page)
+    fetch_css_links(parsed_page=parsed_page)
+    fetch_html_links(parsed_page=parsed_page)
+
+if __name__ == "__main__":  # do not hit the network on a mere import
+    fetch_page_links(f"http://localhost:{PORT}/")
author	Alexander Kabui	2021-02-09 23:28:26 +0300
committer	BonfaceKilz	2021-02-13 16:55:08 +0300
commit	06986d1c3a9d76e9e4b0f0d2a7c89746a5a19990 (patch)
tree	6cf4e1b104bdf6aad133116b6db7a0c481f48360 /test/requests/links_scraper/genelinks.py
parent	b07861ddd9805903f8b8ac509d68718481c61c3b (diff)
download	genenetwork2-06986d1c3a9d76e9e4b0f0d2a7c89746a5a19990.tar.gz