add check for url validity

author: Alexander Kabui 2021-02-11 08:16:33 +0300
committer: BonfaceKilz 2021-02-13 16:55:08 +0300
commit: 0cda88d6112ae0e4ae9ef5d4491a2d5695a07330 (patch)
tree: f81ef020ada834d6424daaa2cbc0a88eda0a8926 /test/requests/links_scraper
parent: 6f3273429482dea1aaaa4e5fe61b178485e271b8 (diff)
download: genenetwork2-0cda88d6112ae0e4ae9ef5d4491a2d5695a07330.tar.gz
1 files changed, 43 insertions, 18 deletions
diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py
index 5dddcc47..3b8ce230 100644
--- a/test/requests/links_scraper/genelinks.py
+++ b/test/requests/links_scraper/genelinks.py
@@ -12,6 +12,8 @@ from urllib.parse import urlparse
 
 PORT = os.environ.get("PORT", "5004")
 
+BROKEN_LINKS = set()
+
 
 def is_valid_link(url_link):
     try:
@@ -21,23 +23,21 @@ def is_valid_link(url_link):
         return False
 
 
-def test_link(link, strict=True):
-    print(f"link testing {link}")
+def test_link(link):
+    print(f'Checking -->{link}')
     results = None
     try:
 
         results = requests.get(link, verify=False, timeout=10)
+        status_code = results.status_code
 
     except Exception as e:
-        if strict:
-            raise SystemExit(
-                "The link does not exists or is wrongly formatted")
-        else:
-            logging.error(f"FAILED:{link} does not exists or is wrongly formatted")
+        status_code = 408
 
-    status_code = results.status_code if results is not None else "404"
+    if int(status_code) > 403:
+        return True
 
-    print(f'the link {link} ---> {status_code}')
+    return False
 
 
 def fetch_css_links(parsed_page):
@@ -46,15 +46,15 @@ def fetch_css_links(parsed_page):
         full_path = None
 
         link_url = link.attrs.get("href")
-        if re.match(r"^http://", link_url):
-            pass
-            # not sure whether to raise an error here for external css links
+        if is_valid_link(link_url):
+            full_path = link_url
 
         elif re.match(r"^/css", link_url) or re.match(r"^/js", link_url):
             full_path = urljoin('http://localhost:5004/', link_url)
 
         if full_path is not None:
-            test_link(full_path)
+            if test_link(full_path):
+                BROKEN_LINKS.add(full_path)
 
 
 def fetch_html_links(parsed_page):
@@ -67,11 +67,11 @@ def fetch_html_links(parsed_page):
             full_path = urljoin('http://localhost:5004/', link_url)
 
         elif is_valid_link(link_url):
-            print(link_url)
             full_path = link_url
 
         if full_path is not None:
-            test_link(full_path)
+            if test_link(full_path):
+                BROKEN_LINKS.add(full_path)
 
 
 def fetch_script_tags(parsed_page):
@@ -79,13 +79,14 @@ def fetch_script_tags(parsed_page):
     for link in parsed_page.findAll("script"):
         js_link = link.attrs.get("src")
         if js_link is not None:
-            if re.match(r'^http://', js_link):
+            if is_valid_link(js_link):
                 raise SystemExit("Failed,the library should be packaged in guix.\
                                 Please contact,http://genenetwork.org/ for more details")
 
             elif re.match(r"^/css", js_link) or re.match(r"^/js", js_link):
                 full_path = urljoin('http://localhost:5004/', js_link)
-                test_link(full_path)
+                if test_link(full_path):
+                    BROKEN_LINKS.add(full_path)
 
 
 def fetch_page_links(page_url):
@@ -99,4 +100,28 @@ def fetch_page_links(page_url):
     fetch_html_links(parsed_page=parsed_page)
 
 
-fetch_page_links(f"http://localhost:{PORT}/")
+def webpages_to_check():
+    pages = [
+
+        "http://localhost:/5004",
+
+
+
+
+
+
+    ]
+
+    return pages
+
+
+if __name__ == '__main__':
+    for page in webpages_to_check():
+        fetch_page_links(f"http://localhost:{PORT}/")
+        if BROKEN_LINKS is not None:
+            print("THE LINKS BELOW ARE BROKEN>>>>>>>>>>>>>")
+            for link in BROKEN_LINKS:
+                print(link)
+
+            raise SystemExit(
+                "The links Above are broken.Please contact genenetwork.org<<<<<<<<")
author	Alexander Kabui	2021-02-11 08:16:33 +0300
committer	BonfaceKilz	2021-02-13 16:55:08 +0300
commit	0cda88d6112ae0e4ae9ef5d4491a2d5695a07330 (patch)
tree	f81ef020ada834d6424daaa2cbc0a88eda0a8926 /test/requests/links_scraper
parent	6f3273429482dea1aaaa4e5fe61b178485e271b8 (diff)
download	genenetwork2-0cda88d6112ae0e4ae9ef5d4491a2d5695a07330.tar.gz