about summary refs log tree commit diff
path: root/test/requests/links_scraper
diff options
context:
space:
mode:
Diffstat (limited to 'test/requests/links_scraper')
-rw-r--r-- test/requests/links_scraper/genelinks.py | 29
1 files changed, 9 insertions, 20 deletions
diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py
index 13aee7c8..52c13489 100644
--- a/test/requests/links_scraper/genelinks.py
+++ b/test/requests/links_scraper/genelinks.py
@@ -25,7 +25,6 @@ def search_templates():
parsed_page = soup(
open(file_path, encoding="utf8"), "html.parser")
html_parsed_pages.append(parsed_page)
-
return html_parsed_pages
@@ -33,7 +32,7 @@ def is_valid_link(url_link):
try:
result = urlparse(url_link)
return all([result.scheme, result.netloc, result.path])
- except Exception as e:
+ except Exception:
return False
@@ -41,13 +40,10 @@ def test_link(link):
print(f'Checking -->{link}')
results = None
try:
-
results = requests.get(link, verify=False, timeout=10)
status_code = results.status_code
-
- except Exception as e:
+ except Exception:
status_code = 408
-
return int(status_code) > 403
@@ -55,14 +51,11 @@ def fetch_css_links(parsed_page):
print("fetching css links")
for link in parsed_page.findAll("link"):
full_path = None
-
link_url = link.attrs.get("href")
if is_valid_link(link_url):
full_path = link_url
-
elif re.match(r"^/css", link_url) or re.match(r"^/js", link_url):
full_path = urljoin('http://localhost:5004/', link_url)
-
if full_path is not None:
if test_link(full_path):
BROKEN_LINKS.add(full_path)
@@ -70,16 +63,13 @@ def fetch_css_links(parsed_page):
def fetch_html_links(parsed_page):
print("fetching a tags ")
-
for link in parsed_page.findAll("a"):
full_path = None
link_url = link.attrs.get("href")
if re.match(r"^/", link_url):
full_path = urljoin('http://localhost:5004/', link_url)
-
elif is_valid_link(link_url):
full_path = link_url
-
if full_path is not None:
if test_link(full_path):
BROKEN_LINKS.add(full_path)
@@ -91,8 +81,11 @@ def fetch_script_tags(parsed_page):
js_link = link.attrs.get("src")
if js_link is not None:
if is_valid_link(js_link):
- raise SystemExit("Failed,the library should be packaged in guix.\
- Please contact,http://genenetwork.org/ for more details")
+ raise SystemExit("Failed,the library should be "
+ "packaged in guix. "
+ "Please contact, "
+ "http://genenetwork.org/ "
+ "for more details")
elif re.match(r"^/css", js_link) or re.match(r"^/js", js_link):
full_path = urljoin('http://localhost:5004/', js_link)
@@ -101,11 +94,9 @@ def fetch_script_tags(parsed_page):
def fetch_page_links(page_url):
-
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
html_page = uReq(page_url)
parsed_page = soup(html_page, "html.parser")
-
fetch_script_tags(parsed_page=parsed_page)
fetch_css_links(parsed_page=parsed_page)
fetch_html_links(parsed_page=parsed_page)
@@ -113,13 +104,10 @@ def fetch_page_links(page_url):
def webpages_to_check():
pages = [f"http://localhost:{PORT}/"]
-
return pages
if __name__ == '__main__':
- # results = search_templates()
-
for page in webpages_to_check():
fetch_page_links(page)
if len(BROKEN_LINKS) > 0:
@@ -129,4 +117,5 @@ if __name__ == '__main__':
if len(BROKEN_LINKS) > 0:
raise SystemExit(
- "The links Above are broken.Please contact genenetwork.org<<<<<<<<")
+ "The links Above are broken. "
+ "Please contact genenetwork.org<<<<<<<<")