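"""Crawl a locally running GeneNetwork (GN2) instance and report broken links.

Each page returned by webpages_to_check() is fetched, and its <link>, <a>
and <script> targets are resolved and requested.  Any URL answering with an
HTTP status above 403 (or failing outright) is collected in BROKEN_LINKS and
reported on exit.
"""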
import re
import os

import requests
import urllib3
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
from urllib.parse import urljoin, urlparse

PORT = os.environ.get("PORT", "5004")
# Base URL of the local server under test; root-relative links are
# resolved against it.
BASE_URL = f"http://localhost:{PORT}/"
TEMPLATE_PATH = "../wqflask/wqflask/templates"
BROKEN_LINKS = set()


def search_templates():
    """Parse every HTML template under TEMPLATE_PATH and return the soups."""
    html_parsed_pages = []
    for subdir, _dirs, files in os.walk(TEMPLATE_PATH):
        for file in files:
            file_path = os.path.join(subdir, file)
            if file_path.endswith(".html"):
                # Use a context manager so the file handle is closed.
                with open(file_path, encoding="utf8") as template_file:
                    html_parsed_pages.append(
                        soup(template_file, "html.parser"))
    return html_parsed_pages
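

# Note: search_templates() is not called from __main__ below; it can be
# used to parse the template files directly when no server is running.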


def is_valid_link(url_link):
    """Return True if url_link is an absolute URL with a scheme and a host.

    A path is not required, so bare-domain links such as
    "http://genenetwork.org" are still checked.
    """
    try:
        result = urlparse(url_link)
        return all([result.scheme, result.netloc])
    except Exception:
        return False
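
# e.g. is_valid_link("https://genenetwork.org/about") -> True
#      is_valid_link("/css/main.css") -> False (relative paths are joined
#      onto BASE_URL by the callers instead)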


def test_link(link):
    """Return True if the link appears broken: an HTTP status above 403,
    or a failed request (treated as a 408 timeout)."""
    print(f"Checking --> {link}")
    try:
        status_code = requests.get(link, verify=False, timeout=10).status_code
    except Exception:
        status_code = 408
    return int(status_code) > 403


def fetch_css_links(parsed_page):
    """Check the href targets of <link> tags (stylesheets and the like)."""
    print("Fetching css links")
    for link in parsed_page.find_all("link"):
        full_path = None
        link_url = link.attrs.get("href")
        if link_url is None:
            continue
        if is_valid_link(link_url):
            full_path = link_url
        elif re.match(r"^/css", link_url) or re.match(r"^/js", link_url):
            full_path = urljoin(BASE_URL, link_url)
        if full_path is not None and test_link(full_path):
            BROKEN_LINKS.add(full_path)


def fetch_html_links(parsed_page):
    """Check the href targets of <a> tags."""
    print("Fetching <a> tags")
    for link in parsed_page.find_all("a"):
        full_path = None
        link_url = link.attrs.get("href")
        if link_url is None:
            continue
        if re.match(r"^/", link_url):
            full_path = urljoin(BASE_URL, link_url)
        elif is_valid_link(link_url):
            full_path = link_url
        if full_path is not None and test_link(full_path):
            BROKEN_LINKS.add(full_path)


def fetch_script_tags(parsed_page):
    """Check the src targets of <script> tags.

    An absolute (external) script URL is a hard failure: JS libraries are
    expected to be packaged in Guix and served locally.
    """
    print("Fetching js links")
    for link in parsed_page.find_all("script"):
        js_link = link.attrs.get("src")
        if js_link is None:
            continue
        if is_valid_link(js_link):
            raise SystemExit("Failed: the library should be packaged "
                             "in Guix. Please contact "
                             "http://genenetwork.org/ for more details.")
        if re.match(r"^/css", js_link) or re.match(r"^/js", js_link):
            full_path = urljoin(BASE_URL, js_link)
            if test_link(full_path):
                BROKEN_LINKS.add(full_path)


def fetch_page_links(page_url):
    """Fetch a rendered page and check all of its script, css and <a> links."""
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    parsed_page = soup(uReq(page_url), "html.parser")
    fetch_script_tags(parsed_page=parsed_page)
    fetch_css_links(parsed_page=parsed_page)
    fetch_html_links(parsed_page=parsed_page)


def webpages_to_check():
    """Return the list of pages to crawl."""
    pages = [BASE_URL]
    return pages
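

# Typical invocation, assuming a GN2 server is already running locally:
#   $ PORT=5004 python3 <this_script>.py
# Raising SystemExit with a message prints it to stderr and exits non-zero,
# so a run that finds broken links will fail a CI pipeline step.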


if __name__ == '__main__':
    for page in webpages_to_check():
        fetch_page_links(page)
    if BROKEN_LINKS:
        print("THE LINKS BELOW ARE BROKEN>>>>>>>>>>>>>")
        for link in BROKEN_LINKS:
            print(link)
        raise SystemExit(
            "The links above are broken. "
            "Please contact genenetwork.org")