From 06986d1c3a9d76e9e4b0f0d2a7c89746a5a19990 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Tue, 9 Feb 2021 23:28:26 +0300
Subject: add check for broken links in genenetwork homepage

---
 test/requests/links_scraper/genelinks.py | 92 ++++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100644 test/requests/links_scraper/genelinks.py

(limited to 'test/requests')

diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py
new file mode 100644
index 00000000..223a2c31
--- /dev/null
+++ b/test/requests/links_scraper/genelinks.py
@@ -0,0 +1,92 @@
+import re
+import requests
+import urllib3
+import os
+import logging
+
+from urllib.request import urlopen as uReq
+from bs4 import BeautifulSoup as soup
+from urllib.parse import urljoin
+
+
+PORT = os.environ.get("PORT", "5004")
+
+
+def test_link(link, strict=True):
+    print(f"link testing {link}")
+    results = None
+    try:
+
+        results = requests.get(link, verify=False, timeout=10)
+
+    except Exception as e:
+        if strict:
+            raise SystemExit(
+                "The link does not exists or is wrongly formatted")
+        else:
+            logging.error(f"FAILED:{link} does not exists or is wrongly formatted")
+
+    status_code = results.status_code if results is not None else "404"
+
+    print(f'the link {link} ---> {status_code}')
+
+
+def fetch_css_links(parsed_page):
+    print("fetching css links")
+    for link in parsed_page.findAll("link"):
+        full_path = None
+
+        link_url = link.attrs.get("href")
+        if re.match(r"^http://", link_url):
+            pass
+            # not sure whether to raise an error here for external css links
+
+        elif re.match(r"^/css", link_url) or re.match(r"^/js", link_url):
+            full_path = urljoin('http://localhost:5004/', link_url)
+
+        if full_path is not None:
+            test_link(full_path)
+
+
+def fetch_html_links(parsed_page):
+    """Check the root-relative and absolute hrefs of the page's <a> tags."""
+    print("fetching a tags ")
+    for link in parsed_page.findAll("a"):
+        full_path = None
+        link_url = link.attrs.get("href", "")  # href-less <a> -> "" not None
+        if re.match(r"^/", link_url):
+            full_path = urljoin('http://localhost:5004/', link_url)
+
+        elif re.match(r'^http://', link_url):
+            full_path = link_url
+
+        if full_path is not None:
+            test_link(full_path)
+
+
+def fetch_script_tags(parsed_page):
+    print("--->fetching js links")
+    for link in parsed_page.findAll("script"):
+        js_link = link.attrs.get("src")
+        if js_link is not None:
+            if re.match(r'^http://', js_link):
+                raise SystemExit("Failed,the library should be packaged in guix.\
+                                Please contact,http://genenetwork.org/ for more details")
+
+            elif re.match(r"^/css", js_link) or re.match(r"^/js", js_link):
+                full_path = urljoin('http://localhost:5004/', js_link)
+                test_link(full_path)
+
+
+def fetch_page_links(page_url):
+    """Download page_url and run the script, css and anchor link checks."""
+    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+    with uReq(page_url) as html_page:  # close the response once parsed
+        parsed_page = soup(html_page, "html.parser")
+
+    fetch_script_tags(parsed_page=parsed_page)
+    fetch_css_links(parsed_page=parsed_page)
+    fetch_html_links(parsed_page=parsed_page)
+
+
+fetch_page_links(f"http://localhost:{PORT}/")
-- 
cgit v1.2.3


From e1c3827d65a35d87e45cefe04135c1ff16374410 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Thu, 11 Feb 2021 03:26:49 +0300
Subject: add links validator

---
 test/requests/links_scraper/genelinks.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'test/requests')

diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py
index 223a2c31..5dddcc47 100644
--- a/test/requests/links_scraper/genelinks.py
+++ b/test/requests/links_scraper/genelinks.py
@@ -7,11 +7,20 @@ import logging
 from urllib.request import urlopen as uReq
 from bs4 import BeautifulSoup as soup
 from urllib.parse import urljoin
+from urllib.parse import urlparse
 
 
 PORT = os.environ.get("PORT", "5004")
 
 
+def is_valid_link(url_link):
+    try:
+        result = urlparse(url_link)
+        return all([result.scheme, result.netloc, result.path])
+    except Exception as e:
+        return False
+
+
 def test_link(link, strict=True):
     print(f"link testing {link}")
     results = None
@@ -57,7 +66,8 @@ def fetch_html_links(parsed_page):
         if re.match(r"^/", link_url):
             full_path = urljoin('http://localhost:5004/', link_url)
 
-        elif re.match(r'^http://', link_url):
+        elif is_valid_link(link_url):
+            print(link_url)
             full_path = link_url
 
         if full_path is not None:
-- 
cgit v1.2.3


From 0cda88d6112ae0e4ae9ef5d4491a2d5695a07330 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Thu, 11 Feb 2021 08:16:33 +0300
Subject: add check for url validity

---
 test/requests/links_scraper/genelinks.py | 61 ++++++++++++++++++++++----------
 1 file changed, 43 insertions(+), 18 deletions(-)

(limited to 'test/requests')

diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py
index 5dddcc47..3b8ce230 100644
--- a/test/requests/links_scraper/genelinks.py
+++ b/test/requests/links_scraper/genelinks.py
@@ -12,6 +12,8 @@ from urllib.parse import urlparse
 
 PORT = os.environ.get("PORT", "5004")
 
+BROKEN_LINKS = set()
+
 
 def is_valid_link(url_link):
     try:
@@ -21,23 +23,21 @@ def is_valid_link(url_link):
         return False
 
 
-def test_link(link, strict=True):
-    print(f"link testing {link}")
+def test_link(link):
+    print(f'Checking -->{link}')
     results = None
     try:
 
         results = requests.get(link, verify=False, timeout=10)
+        status_code = results.status_code
 
     except Exception as e:
-        if strict:
-            raise SystemExit(
-                "The link does not exists or is wrongly formatted")
-        else:
-            logging.error(f"FAILED:{link} does not exists or is wrongly formatted")
+        status_code = 408
 
-    status_code = results.status_code if results is not None else "404"
+    if int(status_code) > 403:
+        return True
 
-    print(f'the link {link} ---> {status_code}')
+    return False
 
 
 def fetch_css_links(parsed_page):
@@ -46,15 +46,15 @@ def fetch_css_links(parsed_page):
         full_path = None
 
         link_url = link.attrs.get("href")
-        if re.match(r"^http://", link_url):
-            pass
-            # not sure whether to raise an error here for external css links
+        if is_valid_link(link_url):
+            full_path = link_url
 
         elif re.match(r"^/css", link_url) or re.match(r"^/js", link_url):
             full_path = urljoin('http://localhost:5004/', link_url)
 
         if full_path is not None:
-            test_link(full_path)
+            if test_link(full_path):
+                BROKEN_LINKS.add(full_path)
 
 
 def fetch_html_links(parsed_page):
@@ -67,11 +67,11 @@ def fetch_html_links(parsed_page):
             full_path = urljoin('http://localhost:5004/', link_url)
 
         elif is_valid_link(link_url):
-            print(link_url)
             full_path = link_url
 
         if full_path is not None:
-            test_link(full_path)
+            if test_link(full_path):
+                BROKEN_LINKS.add(full_path)
 
 
 def fetch_script_tags(parsed_page):
@@ -79,13 +79,14 @@ def fetch_script_tags(parsed_page):
     for link in parsed_page.findAll("script"):
         js_link = link.attrs.get("src")
         if js_link is not None:
-            if re.match(r'^http://', js_link):
+            if is_valid_link(js_link):
                 raise SystemExit("Failed,the library should be packaged in guix.\
                                 Please contact,http://genenetwork.org/ for more details")
 
             elif re.match(r"^/css", js_link) or re.match(r"^/js", js_link):
                 full_path = urljoin('http://localhost:5004/', js_link)
-                test_link(full_path)
+                if test_link(full_path):
+                    BROKEN_LINKS.add(full_path)
 
 
 def fetch_page_links(page_url):
@@ -99,4 +100,28 @@ def fetch_page_links(page_url):
     fetch_html_links(parsed_page=parsed_page)
 
 
-fetch_page_links(f"http://localhost:{PORT}/")
+def webpages_to_check():
+    pages = [
+
+        "http://localhost:/5004",
+
+
+
+
+
+
+    ]
+
+    return pages
+
+
+if __name__ == '__main__':
+    for page in webpages_to_check():
+        fetch_page_links(f"http://localhost:{PORT}/")
+        if BROKEN_LINKS is not None:
+            print("THE LINKS BELOW ARE BROKEN>>>>>>>>>>>>>")
+            for link in BROKEN_LINKS:
+                print(link)
+
+            raise SystemExit(
+                "The links Above are broken.Please contact genenetwork.org<<<<<<<<")
-- 
cgit v1.2.3


From 5f4ad48a7afaca3cf34266c1012efedb7ef2ec46 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Thu, 11 Feb 2021 08:43:37 +0300
Subject: pep8 formatting

---
 test/requests/links_scraper/genelinks.py | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

(limited to 'test/requests')

diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py
index 3b8ce230..00a71d57 100644
--- a/test/requests/links_scraper/genelinks.py
+++ b/test/requests/links_scraper/genelinks.py
@@ -101,23 +101,14 @@ def fetch_page_links(page_url):
 
 
 def webpages_to_check():
-    pages = [
-
-        "http://localhost:/5004",
-
-
-
-
-
-
-    ]
+    pages = [f"http://localhost:{PORT}/"]
 
     return pages
 
 
 if __name__ == '__main__':
     for page in webpages_to_check():
-        fetch_page_links(f"http://localhost:{PORT}/")
+        fetch_page_links(page)
         if BROKEN_LINKS is not None:
             print("THE LINKS BELOW ARE BROKEN>>>>>>>>>>>>>")
             for link in BROKEN_LINKS:
-- 
cgit v1.2.3


From f46991d6751efaac1687c12a74a92d913d61ac54 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Thu, 11 Feb 2021 22:32:56 +0300
Subject: modify code for link_checker

---
 test/requests/links_scraper/genelinks.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'test/requests')

diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py
index 00a71d57..ca98f62f 100644
--- a/test/requests/links_scraper/genelinks.py
+++ b/test/requests/links_scraper/genelinks.py
@@ -109,10 +109,11 @@ def webpages_to_check():
 if __name__ == '__main__':
     for page in webpages_to_check():
         fetch_page_links(page)
-        if BROKEN_LINKS is not None:
+        if len(BROKEN_LINKS) > 0:
             print("THE LINKS BELOW ARE BROKEN>>>>>>>>>>>>>")
             for link in BROKEN_LINKS:
                 print(link)
 
-            raise SystemExit(
-                "The links Above are broken.Please contact genenetwork.org<<<<<<<<")
+    if len(BROKEN_LINKS) > 0:
+        raise SystemExit(
+            "The links Above are broken.Please contact genenetwork.org<<<<<<<<")
-- 
cgit v1.2.3


From b53a8362ba1d3031ece2deefdc3309823b932012 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Thu, 11 Feb 2021 23:11:45 +0300
Subject: add search for templates

---
 test/requests/links_scraper/genelinks.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'test/requests')

diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py
index ca98f62f..6a3d363e 100644
--- a/test/requests/links_scraper/genelinks.py
+++ b/test/requests/links_scraper/genelinks.py
@@ -11,10 +11,25 @@ from urllib.parse import urlparse
 
 
 PORT = os.environ.get("PORT", "5004")
+TEMPLATE_PATH = "../wqflask/wqflask/templates"
 
 BROKEN_LINKS = set()
 
 
+def search_templates():
+    """Return a parsed soup for each .html file found under TEMPLATE_PATH."""
+    html_parsed_pages = []
+    for subdir, _dirs, files in os.walk(TEMPLATE_PATH):
+        for file in files:
+            file_path = os.path.join(subdir, file)
+            if file_path.endswith(".html"):
+                # Parse inside `with` so each template handle is closed.
+                with open(file_path, encoding="utf8") as template:
+                    html_parsed_pages.append(soup(template, "html.parser"))
+
+    return html_parsed_pages
+
+
 def is_valid_link(url_link):
     try:
         result = urlparse(url_link)
@@ -107,6 +122,8 @@ def webpages_to_check():
 
 
 if __name__ == '__main__':
+    # results = search_templates()
+
     for page in webpages_to_check():
         fetch_page_links(page)
         if len(BROKEN_LINKS) > 0:
-- 
cgit v1.2.3


From 187415f223b101f8c0b0ac100b2cf8e19c0ad3a5 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Sat, 13 Feb 2021 08:56:36 +0300
Subject: refactor to use idiomatic python

---
 test/requests/links_scraper/genelinks.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'test/requests')

diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py
index 6a3d363e..12300f4a 100644
--- a/test/requests/links_scraper/genelinks.py
+++ b/test/requests/links_scraper/genelinks.py
@@ -49,10 +49,7 @@ def test_link(link):
     except Exception as e:
         status_code = 408
 
-    if int(status_code) > 403:
-        return True
-
-    return False
+    return int(status_code) > 403
 
 
 def fetch_css_links(parsed_page):
-- 
cgit v1.2.3