about summary refs log tree commit diff
path: root/test/requests/links_scraper
diff options
context:
space:
mode:
authorArthur Centeno2021-10-25 21:04:23 +0000
committerArthur Centeno2021-10-25 21:04:23 +0000
commit499a80f138030c4de1629c043c8f9401a99894ea (patch)
tree449dcae965d13f966fb6d52625fbc86661c8c6a0 /test/requests/links_scraper
parent6151faa9ea67af4bf4ea95fb681a9dc4319474b6 (diff)
parent700802303e5e8221a9d591ba985d6607aa61e1ce (diff)
downloadgenenetwork2-499a80f138030c4de1629c043c8f9401a99894ea.tar.gz
Merge github.com:genenetwork/genenetwork2 into acenteno
Diffstat (limited to 'test/requests/links_scraper')
-rw-r--r--test/requests/links_scraper/genelinks.py121
1 files changed, 121 insertions, 0 deletions
diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py
new file mode 100644
index 00000000..52c13489
--- /dev/null
+++ b/test/requests/links_scraper/genelinks.py
@@ -0,0 +1,121 @@
+import re
+import requests
+import urllib3
+import os
+
+from urllib.request import urlopen as uReq
+from bs4 import BeautifulSoup as soup
+from urllib.parse import urljoin
+from urllib.parse import urlparse
+
+
+PORT = os.environ.get("PORT", "5004")
+TEMPLATE_PATH = "../wqflask/wqflask/templates"
+
+BROKEN_LINKS = set()
+
+
def search_templates():
    """Parse every ``.html`` template under TEMPLATE_PATH.

    Walks the template tree and parses each HTML file with
    BeautifulSoup so its links can be checked for breakage.

    Returns:
        list: parsed BeautifulSoup documents, one per template file.
    """
    html_parsed_pages = []
    for subdir, _dirs, files in os.walk(TEMPLATE_PATH):
        for file_name in files:
            file_path = os.path.join(subdir, file_name)
            if file_path.endswith(".html"):
                # Use a context manager so the file handle is closed;
                # the original left every template file open.
                with open(file_path, encoding="utf8") as template_file:
                    html_parsed_pages.append(
                        soup(template_file, "html.parser"))
    return html_parsed_pages
+
+
def is_valid_link(url_link):
    """Return True when *url_link* parses with a scheme, host and path.

    Note that a bare domain URL (no path component) is rejected.
    """
    try:
        parts = urlparse(url_link)
    except Exception:
        return False
    return bool(parts.scheme and parts.netloc and parts.path)
+
+
def test_link(link):
    """Probe *link* with a GET request and report whether it looks broken.

    Args:
        link: absolute URL to check.

    Returns:
        bool: True when the response status is above 403 (404, 5xx, ...)
        or the request failed/timed out (mapped to 408).
    """
    print(f'Checking -->{link}')
    try:
        # verify=False: the target may serve a self-signed certificate.
        status_code = requests.get(link, verify=False, timeout=10).status_code
    except requests.exceptions.RequestException:
        # Connection error or timeout -- record as Request Timeout.
        status_code = 408
    return status_code > 403
+
+
def fetch_css_links(parsed_page):
    """Check every ``<link href>`` in *parsed_page* for breakage.

    Absolute URLs are tested as-is; server-relative /css and /js paths
    are resolved against the local server. Broken targets are added to
    the module-level BROKEN_LINKS set.
    """
    print("fetching css links")
    for link in parsed_page.findAll("link"):
        link_url = link.attrs.get("href")
        if link_url is None:
            # A <link> without href would crash re.match(None); skip it.
            continue
        full_path = None
        if is_valid_link(link_url):
            full_path = link_url
        elif re.match(r"^/css", link_url) or re.match(r"^/js", link_url):
            # Honor the PORT env var instead of hard-coding 5004,
            # matching webpages_to_check().
            full_path = urljoin(f"http://localhost:{PORT}/", link_url)
        if full_path is not None and test_link(full_path):
            BROKEN_LINKS.add(full_path)
+
+
def fetch_html_links(parsed_page):
    """Check every ``<a href>`` in *parsed_page* for breakage.

    Server-relative paths are resolved against the local server;
    absolute URLs are tested directly. Broken targets are added to the
    module-level BROKEN_LINKS set.
    """
    print("fetching a tags ")
    for anchor in parsed_page.findAll("a"):
        link_url = anchor.attrs.get("href")
        if link_url is None:
            # Anchors without href (e.g. <a name=...>) would crash
            # re.match(None); skip them.
            continue
        full_path = None
        if re.match(r"^/", link_url):
            # Honor the PORT env var instead of hard-coding 5004,
            # matching webpages_to_check().
            full_path = urljoin(f"http://localhost:{PORT}/", link_url)
        elif is_valid_link(link_url):
            full_path = link_url
        if full_path is not None and test_link(full_path):
            BROKEN_LINKS.add(full_path)
+
+
def fetch_script_tags(parsed_page):
    """Check ``<script src>`` references in *parsed_page*.

    Any absolute (external) script URL aborts the run: JS libraries are
    expected to be packaged in Guix, not fetched from CDNs. Local /css
    and /js paths are probed and recorded in BROKEN_LINKS when broken.

    Raises:
        SystemExit: when an externally-hosted script is found.
    """
    print("--->fetching js links")
    for script in parsed_page.findAll("script"):
        js_link = script.attrs.get("src")
        if js_link is None:
            # Inline <script> blocks have no src; nothing to check.
            continue
        if is_valid_link(js_link):
            raise SystemExit("Failed,the library should be "
                             "packaged in guix. "
                             "Please contact, "
                             "http://genenetwork.org/ "
                             "for more details")
        if re.match(r"^/css", js_link) or re.match(r"^/js", js_link):
            # Honor the PORT env var instead of hard-coding 5004,
            # matching webpages_to_check().
            full_path = urljoin(f"http://localhost:{PORT}/", js_link)
            if test_link(full_path):
                BROKEN_LINKS.add(full_path)
+
+
def fetch_page_links(page_url):
    """Download *page_url* and scan its scripts, stylesheets and anchors."""
    # Silence the self-signed-certificate warnings from verify=False calls.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    parsed_page = soup(uReq(page_url), "html.parser")
    for scan in (fetch_script_tags, fetch_css_links, fetch_html_links):
        scan(parsed_page=parsed_page)
+
+
def webpages_to_check():
    """Return the entry-point URLs whose links should be scanned."""
    return [f"http://localhost:{PORT}/"]
+
+
if __name__ == '__main__':
    # Scan each entry page, reporting broken links as they accumulate.
    for page in webpages_to_check():
        fetch_page_links(page)
        if BROKEN_LINKS:
            print("THE LINKS BELOW ARE BROKEN>>>>>>>>>>>>>")
            for broken_link in BROKEN_LINKS:
                print(broken_link)

    # Exit non-zero so CI fails when any link was broken.
    if BROKEN_LINKS:
        raise SystemExit(
            "The links Above are broken. "
            "Please contact genenetwork.org<<<<<<<<")