From 06986d1c3a9d76e9e4b0f0d2a7c89746a5a19990 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Tue, 9 Feb 2021 23:28:26 +0300 Subject: add check for broken links in genenetwork homepage --- test/requests/links_scraper/genelinks.py | 92 ++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 test/requests/links_scraper/genelinks.py (limited to 'test/requests/links_scraper/genelinks.py') diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py new file mode 100644 index 00000000..223a2c31 --- /dev/null +++ b/test/requests/links_scraper/genelinks.py @@ -0,0 +1,92 @@ +import re +import requests +import urllib3 +import os +import logging + +from urllib.request import urlopen as uReq +from bs4 import BeautifulSoup as soup +from urllib.parse import urljoin + + +PORT = os.environ.get("PORT", "5004") + + +def test_link(link, strict=True): + print(f"link testing {link}") + results = None + try: + + results = requests.get(link, verify=False, timeout=10) + + except Exception as e: + if strict: + raise SystemExit( + "The link does not exists or is wrongly formatted") + else: + logging.error(f"FAILED:{link} does not exists or is wrongly formatted") + + status_code = results.status_code if results is not None else "404" + + print(f'the link {link} ---> {status_code}') + + +def fetch_css_links(parsed_page): + print("fetching css links") + for link in parsed_page.findAll("link"): + full_path = None + + link_url = link.attrs.get("href") + if re.match(r"^http://", link_url): + pass + # not sure whether to raise an error here for external css links + + elif re.match(r"^/css", link_url) or re.match(r"^/js", link_url): + full_path = urljoin('http://localhost:5004/', link_url) + + if full_path is not None: + test_link(full_path) + + +def fetch_html_links(parsed_page): + print("fetching a tags ") + + for link in parsed_page.findAll("a"): + full_path = None + link_url = link.attrs.get("href") + if re.match(r"^/", link_url): + full_path = urljoin('http://localhost:5004/', link_url) + + elif re.match(r'^http://', link_url): + full_path = link_url + + if full_path is not None: + test_link(full_path) + + +def fetch_script_tags(parsed_page): + print("--->fetching js links") + for link in parsed_page.findAll("script"): + js_link = link.attrs.get("src") + if js_link is not None: + if re.match(r'^http://', js_link): + raise SystemExit("Failed,the library should be packaged in guix.\ + Please contact,http://genenetwork.org/ for more details") + + elif re.match(r"^/css", js_link) or re.match(r"^/js", js_link): + full_path = urljoin('http://localhost:5004/', js_link) + test_link(full_path) + + +def fetch_page_links(page_url): + + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + html_page = uReq(page_url) + parsed_page = soup(html_page, "html.parser") + + fetch_script_tags(parsed_page=parsed_page) + fetch_css_links(parsed_page=parsed_page) + fetch_html_links(parsed_page=parsed_page) + + +fetch_page_links(f"http://localhost:{PORT}/") -- cgit v1.2.3 From e1c3827d65a35d87e45cefe04135c1ff16374410 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Feb 2021 03:26:49 +0300 Subject: add links validator --- test/requests/links_scraper/genelinks.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'test/requests/links_scraper/genelinks.py') diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py index 223a2c31..5dddcc47 100644 --- a/test/requests/links_scraper/genelinks.py +++ b/test/requests/links_scraper/genelinks.py @@ -7,11 +7,20 @@ import logging from urllib.request import urlopen as uReq from bs4 import BeautifulSoup as soup from urllib.parse import urljoin +from urllib.parse import urlparse PORT = os.environ.get("PORT", "5004") +def is_valid_link(url_link): + try: + result = urlparse(url_link) + return all([result.scheme, result.netloc, result.path]) + except Exception as e: + return False + + def test_link(link, strict=True): print(f"link testing {link}") results = None @@ -57,7 +66,8 @@ def fetch_html_links(parsed_page): if re.match(r"^/", link_url): full_path = urljoin('http://localhost:5004/', link_url) - elif re.match(r'^http://', link_url): + elif is_valid_link(link_url): + print(link_url) full_path = link_url if full_path is not None: -- cgit v1.2.3 From 0cda88d6112ae0e4ae9ef5d4491a2d5695a07330 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Feb 2021 08:16:33 +0300 Subject: add check for url validity --- test/requests/links_scraper/genelinks.py | 61 ++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 18 deletions(-) (limited to 'test/requests/links_scraper/genelinks.py') diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py index 5dddcc47..3b8ce230 100644 --- a/test/requests/links_scraper/genelinks.py +++ b/test/requests/links_scraper/genelinks.py @@ -12,6 +12,8 @@ from urllib.parse import urlparse PORT = os.environ.get("PORT", "5004") +BROKEN_LINKS = set() + def is_valid_link(url_link): try: @@ -21,23 +23,21 @@ def is_valid_link(url_link): return False -def test_link(link, strict=True): - print(f"link testing {link}") +def test_link(link): + print(f'Checking -->{link}') results = None try: results = requests.get(link, verify=False, timeout=10) + status_code = results.status_code except Exception as e: - if strict: - raise SystemExit( - "The link does not exists or is wrongly formatted") - else: - logging.error(f"FAILED:{link} does not exists or is wrongly formatted") + status_code = 408 - status_code = results.status_code if results is not None else "404" + if int(status_code) > 403: + return True - print(f'the link {link} ---> {status_code}') + return False def fetch_css_links(parsed_page): @@ -46,15 +46,15 @@ def fetch_css_links(parsed_page): full_path = None link_url = link.attrs.get("href") - if re.match(r"^http://", link_url): - pass - # not sure whether to raise an error here for external css links + if is_valid_link(link_url): + full_path = link_url elif re.match(r"^/css", link_url) or re.match(r"^/js", link_url): full_path = urljoin('http://localhost:5004/', link_url) if full_path is not None: - test_link(full_path) + if test_link(full_path): + BROKEN_LINKS.add(full_path) def fetch_html_links(parsed_page): @@ -67,11 +67,11 @@ def fetch_html_links(parsed_page): full_path = urljoin('http://localhost:5004/', link_url) elif is_valid_link(link_url): - print(link_url) full_path = link_url if full_path is not None: - test_link(full_path) + if test_link(full_path): + BROKEN_LINKS.add(full_path) def fetch_script_tags(parsed_page): @@ -79,13 +79,14 @@ def fetch_script_tags(parsed_page): for link in parsed_page.findAll("script"): js_link = link.attrs.get("src") if js_link is not None: - if re.match(r'^http://', js_link): + if is_valid_link(js_link): raise SystemExit("Failed,the library should be packaged in guix.\ Please contact,http://genenetwork.org/ for more details") elif re.match(r"^/css", js_link) or re.match(r"^/js", js_link): full_path = urljoin('http://localhost:5004/', js_link) - test_link(full_path) + if test_link(full_path): + BROKEN_LINKS.add(full_path) def fetch_page_links(page_url): @@ -99,4 +100,28 @@ def fetch_page_links(page_url): fetch_html_links(parsed_page=parsed_page) -fetch_page_links(f"http://localhost:{PORT}/") +def webpages_to_check(): + pages = [ + + "http://localhost:/5004", + + + + + + + ] + + return pages + + +if __name__ == '__main__': + for page in webpages_to_check(): + fetch_page_links(f"http://localhost:{PORT}/") + if BROKEN_LINKS is not None: + print("THE LINKS BELOW ARE BROKEN>>>>>>>>>>>>>") + for link in BROKEN_LINKS: + print(link) + + raise SystemExit( + "The links Above are broken.Please contact genenetwork.org<<<<<<<<") -- cgit v1.2.3 From 5f4ad48a7afaca3cf34266c1012efedb7ef2ec46 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Feb 2021 08:43:37 +0300 Subject: pep8 formatting --- test/requests/links_scraper/genelinks.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'test/requests/links_scraper/genelinks.py') diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py index 3b8ce230..00a71d57 100644 --- a/test/requests/links_scraper/genelinks.py +++ b/test/requests/links_scraper/genelinks.py @@ -101,23 +101,14 @@ def fetch_page_links(page_url): def webpages_to_check(): - pages = [ - - "http://localhost:/5004", - - - - - - - ] + pages = [f"http://localhost:{PORT}/"] return pages if __name__ == '__main__': for page in webpages_to_check(): - fetch_page_links(f"http://localhost:{PORT}/") + fetch_page_links(page) if BROKEN_LINKS is not None: print("THE LINKS BELOW ARE BROKEN>>>>>>>>>>>>>") for link in BROKEN_LINKS: -- cgit v1.2.3 From f46991d6751efaac1687c12a74a92d913d61ac54 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Feb 2021 22:32:56 +0300 Subject: modify code for link_checker --- test/requests/links_scraper/genelinks.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'test/requests/links_scraper/genelinks.py') diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py index 00a71d57..ca98f62f 100644 --- a/test/requests/links_scraper/genelinks.py +++ b/test/requests/links_scraper/genelinks.py @@ -109,10 +109,11 @@ def webpages_to_check(): if __name__ == '__main__': for page in webpages_to_check(): fetch_page_links(page) - if BROKEN_LINKS is not None: + if len(BROKEN_LINKS) > 0: print("THE LINKS BELOW ARE BROKEN>>>>>>>>>>>>>") for link in BROKEN_LINKS: print(link) - raise SystemExit( - "The links Above are broken.Please contact genenetwork.org<<<<<<<<") + if len(BROKEN_LINKS) > 0: + raise SystemExit( + "The links Above are broken.Please contact genenetwork.org<<<<<<<<") -- cgit v1.2.3 From b53a8362ba1d3031ece2deefdc3309823b932012 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Feb 2021 23:11:45 +0300 Subject: add search for templates --- test/requests/links_scraper/genelinks.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'test/requests/links_scraper/genelinks.py') diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py index ca98f62f..6a3d363e 100644 --- a/test/requests/links_scraper/genelinks.py +++ b/test/requests/links_scraper/genelinks.py @@ -11,10 +11,25 @@ from urllib.parse import urlparse PORT = os.environ.get("PORT", "5004") +TEMPLATE_PATH = "../wqflask/wqflask/templates" BROKEN_LINKS = set() +def search_templates(): + """searches for broken links in templates""" + html_parsed_pages = [] + for subdir, dirs, files in os.walk(TEMPLATE_PATH): + for file in files: + file_path = os.path.join(subdir, file) + if file_path.endswith(".html"): + parsed_page = soup( + open(file_path, encoding="utf8"), "html.parser") + html_parsed_pages.append(parsed_page) + + return html_parsed_pages + + def is_valid_link(url_link): try: result = urlparse(url_link) @@ -107,6 +122,8 @@ def webpages_to_check(): if __name__ == '__main__': + # results = search_templates() + for page in webpages_to_check(): fetch_page_links(page) if len(BROKEN_LINKS) > 0: -- cgit v1.2.3 From 187415f223b101f8c0b0ac100b2cf8e19c0ad3a5 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Sat, 13 Feb 2021 08:56:36 +0300 Subject: refactor to use idiomatic python --- test/requests/links_scraper/genelinks.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'test/requests/links_scraper/genelinks.py') diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py index 6a3d363e..12300f4a 100644 --- a/test/requests/links_scraper/genelinks.py +++ b/test/requests/links_scraper/genelinks.py @@ -49,10 +49,7 @@ def test_link(link): except Exception as e: status_code = 408 - if int(status_code) > 403: - return True - - return False + return int(status_code) > 403 def fetch_css_links(parsed_page): -- cgit v1.2.3