about | summary | refs | log | tree | commit | diff
path: root/test/requests/links_scraper
diff options
context:
space:
mode:
author zsloan 2021-10-18 17:50:26 +0000
committer zsloan 2021-10-18 17:50:26 +0000
commit e36eaf0003a598bc5aa688803dd1b36c24a4c051 (patch)
tree a59b7dadf02241575eb0774f97c6048e2425c053 /test/requests/links_scraper
parent bd421438f1f0b4de913fa40cd49cfcda27e6b16f (diff)
parent 04f3d13aceeaec2e52b94037d59f08ed6dc6a8bb (diff)
download genenetwork2-e36eaf0003a598bc5aa688803dd1b36c24a4c051.tar.gz
Merge branch 'testing' of github.com:genenetwork/genenetwork2 into feature/remove_trait_creation_from_search
Diffstat (limited to 'test/requests/links_scraper')
-rw-r--r-- test/requests/links_scraper/genelinks.py 30
1 files changed, 9 insertions, 21 deletions
diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py
index 12300f4a..52c13489 100644
--- a/test/requests/links_scraper/genelinks.py
+++ b/test/requests/links_scraper/genelinks.py
@@ -2,7 +2,6 @@ import re
import requests
import urllib3
import os
-import logging
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
@@ -26,7 +25,6 @@ def search_templates():
parsed_page = soup(
open(file_path, encoding="utf8"), "html.parser")
html_parsed_pages.append(parsed_page)
-
return html_parsed_pages
@@ -34,7 +32,7 @@ def is_valid_link(url_link):
try:
result = urlparse(url_link)
return all([result.scheme, result.netloc, result.path])
- except Exception as e:
+ except Exception:
return False
@@ -42,13 +40,10 @@ def test_link(link):
print(f'Checking -->{link}')
results = None
try:
-
results = requests.get(link, verify=False, timeout=10)
status_code = results.status_code
-
- except Exception as e:
+ except Exception:
status_code = 408
-
return int(status_code) > 403
@@ -56,14 +51,11 @@ def fetch_css_links(parsed_page):
print("fetching css links")
for link in parsed_page.findAll("link"):
full_path = None
-
link_url = link.attrs.get("href")
if is_valid_link(link_url):
full_path = link_url
-
elif re.match(r"^/css", link_url) or re.match(r"^/js", link_url):
full_path = urljoin('http://localhost:5004/', link_url)
-
if full_path is not None:
if test_link(full_path):
BROKEN_LINKS.add(full_path)
@@ -71,16 +63,13 @@ def fetch_css_links(parsed_page):
def fetch_html_links(parsed_page):
print("fetching a tags ")
-
for link in parsed_page.findAll("a"):
full_path = None
link_url = link.attrs.get("href")
if re.match(r"^/", link_url):
full_path = urljoin('http://localhost:5004/', link_url)
-
elif is_valid_link(link_url):
full_path = link_url
-
if full_path is not None:
if test_link(full_path):
BROKEN_LINKS.add(full_path)
@@ -92,8 +81,11 @@ def fetch_script_tags(parsed_page):
js_link = link.attrs.get("src")
if js_link is not None:
if is_valid_link(js_link):
- raise SystemExit("Failed,the library should be packaged in guix.\
- Please contact,http://genenetwork.org/ for more details")
+ raise SystemExit("Failed,the library should be "
+ "packaged in guix. "
+ "Please contact, "
+ "http://genenetwork.org/ "
+ "for more details")
elif re.match(r"^/css", js_link) or re.match(r"^/js", js_link):
full_path = urljoin('http://localhost:5004/', js_link)
@@ -102,11 +94,9 @@ def fetch_script_tags(parsed_page):
def fetch_page_links(page_url):
-
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
html_page = uReq(page_url)
parsed_page = soup(html_page, "html.parser")
-
fetch_script_tags(parsed_page=parsed_page)
fetch_css_links(parsed_page=parsed_page)
fetch_html_links(parsed_page=parsed_page)
@@ -114,13 +104,10 @@ def fetch_page_links(page_url):
def webpages_to_check():
pages = [f"http://localhost:{PORT}/"]
-
return pages
if __name__ == '__main__':
- # results = search_templates()
-
for page in webpages_to_check():
fetch_page_links(page)
if len(BROKEN_LINKS) > 0:
@@ -130,4 +117,5 @@ if __name__ == '__main__':
if len(BROKEN_LINKS) > 0:
raise SystemExit(
- "The links Above are broken.Please contact genenetwork.org<<<<<<<<")
+ "The links Above are broken. "
+ "Please contact genenetwork.org<<<<<<<<")