Python code to crawl all link URLs on a web page
Code language: Python
Category: web systems
Code description: Python code that crawls every link URL found on a given page, splits them into internal and external links, and saves the results to local files.
Below is a partial preview of the code; for the complete code, click Download or open it in the bfwstudio webide.
#!/usr/local/python3/bin/python3
# -*- coding: utf-8 -*-
import requests
from urllib.parse import urlparse, urljoin  # urlparse/urljoin live in urllib.parse, not urllib.request
from bs4 import BeautifulSoup
import colorama

# init the colorama module so the ANSI color codes work cross-platform
colorama.init()
GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET

# initialize the sets of links (unique links)
internal_urls = set()
external_urls = set()
total_urls_visited = 0

def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

def get_all_website_links(url):
    """
    Returns all URLs found on `url` that belong to the same website.
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # empty href attribute
            continue
        # join the URL if it's relative (not an absolute link)
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
        if parsed_href.netloc != domain_name:
            # compare hostnames rather than a substring check, so an external
            # URL that merely contains the domain string is not misclassified
            if href not in external_urls:
                print(f"{GRAY}[!] External link: {href}{RESET}")
                external_urls.add(href)
            continue
        print(f"{GREEN}[*] Internal link: {href}{RESET}")
        # record the internal link and include it in this page's result set
        urls.add(href)
        internal_urls.add(href)
    return urls

… For the complete code, please log in and click the download button above to download and view it.
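The preview cuts off before the saving step mentioned in the description. As a minimal sketch of how the crawl results might be written to local files, assuming the function above has already been defined (the target URL and the filenames internal_links.txt / external_links.txt are illustrative, not necessarily what the full download uses):

# hypothetical driver: crawl one page, then save both link sets locally
if __name__ == "__main__":
    target_url = "https://www.example.com"  # assumed example target
    get_all_website_links(target_url)
    # write each set to its own text file, one URL per line
    with open("internal_links.txt", "w") as f:
        for link in sorted(internal_urls):
            f.write(link + "\n")
    with open("external_links.txt", "w") as f:
        for link in sorted(external_urls):
            f.write(link + "\n")
    print(f"Saved {len(internal_urls)} internal and {len(external_urls)} external links.")

Sorting the sets before writing simply makes the output files deterministic between runs.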