diff options
Diffstat (limited to 'test/requests/links_scraper/genelinks.py')
-rw-r--r-- | test/requests/links_scraper/genelinks.py | 30 |
1 files changed, 9 insertions, 21 deletions
diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py index 12300f4a..52c13489 100644 --- a/test/requests/links_scraper/genelinks.py +++ b/test/requests/links_scraper/genelinks.py @@ -2,7 +2,6 @@ import re import requests import urllib3 import os -import logging from urllib.request import urlopen as uReq from bs4 import BeautifulSoup as soup @@ -26,7 +25,6 @@ def search_templates(): parsed_page = soup( open(file_path, encoding="utf8"), "html.parser") html_parsed_pages.append(parsed_page) - return html_parsed_pages @@ -34,7 +32,7 @@ def is_valid_link(url_link): try: result = urlparse(url_link) return all([result.scheme, result.netloc, result.path]) - except Exception as e: + except Exception: return False @@ -42,13 +40,10 @@ def test_link(link): print(f'Checking -->{link}') results = None try: - results = requests.get(link, verify=False, timeout=10) status_code = results.status_code - - except Exception as e: + except Exception: status_code = 408 - return int(status_code) > 403 @@ -56,14 +51,11 @@ def fetch_css_links(parsed_page): print("fetching css links") for link in parsed_page.findAll("link"): full_path = None - link_url = link.attrs.get("href") if is_valid_link(link_url): full_path = link_url - elif re.match(r"^/css", link_url) or re.match(r"^/js", link_url): full_path = urljoin('http://localhost:5004/', link_url) - if full_path is not None: if test_link(full_path): BROKEN_LINKS.add(full_path) @@ -71,16 +63,13 @@ def fetch_css_links(parsed_page): def fetch_html_links(parsed_page): print("fetching a tags ") - for link in parsed_page.findAll("a"): full_path = None link_url = link.attrs.get("href") if re.match(r"^/", link_url): full_path = urljoin('http://localhost:5004/', link_url) - elif is_valid_link(link_url): full_path = link_url - if full_path is not None: if test_link(full_path): BROKEN_LINKS.add(full_path) @@ -92,8 +81,11 @@ def fetch_script_tags(parsed_page): js_link = link.attrs.get("src") if js_link is not None: if is_valid_link(js_link): - raise SystemExit("Failed,the library should be packaged in guix.\ - Please contact,http://genenetwork.org/ for more details") + raise SystemExit("Failed,the library should be " + "packaged in guix. " + "Please contact, " + "http://genenetwork.org/ " + "for more details") elif re.match(r"^/css", js_link) or re.match(r"^/js", js_link): full_path = urljoin('http://localhost:5004/', js_link) @@ -102,11 +94,9 @@ def fetch_script_tags(parsed_page): def fetch_page_links(page_url): - urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) html_page = uReq(page_url) parsed_page = soup(html_page, "html.parser") - fetch_script_tags(parsed_page=parsed_page) fetch_css_links(parsed_page=parsed_page) fetch_html_links(parsed_page=parsed_page) @@ -114,13 +104,10 @@ def fetch_page_links(page_url): def webpages_to_check(): pages = [f"http://localhost:{PORT}/"] - return pages if __name__ == '__main__': - # results = search_templates() - for page in webpages_to_check(): fetch_page_links(page) if len(BROKEN_LINKS) > 0: @@ -130,4 +117,5 @@ if __name__ == '__main__': if len(BROKEN_LINKS) > 0: raise SystemExit( - "The links Above are broken.Please contact genenetwork.org<<<<<<<<") + "The links Above are broken. " + "Please contact genenetwork.org<<<<<<<<") |