python抓取百度baidu搜索引擎结果返回干净的markdown正文文本代码
代码语言:python
所属分类:web系统
代码描述:python抓取百度baidu搜索引擎结果返回干净的markdown正文文本代码,利用selenium爬取搜索结果,然后利用jina提取百度的每个搜索链接,转换成干净的markdown正文,这个可作为本地ai搜索的一个api接口使用。
代码标签: python 抓取 百度 baidu 搜索 引擎 结果 返回 干净 markdown 正文 文本
下面为部分代码预览,完整代码请点击下载或在bfwstudio webide中打开
#!/usr/local/python3/bin/python3
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests


def fetch_json_from_baidu_link(baidu_link, timeout=30):
    """Resolve a Baidu result link and fetch its content as JSON via r.jina.ai.

    The link is first requested with redirects enabled to obtain the final
    URL; that URL is then appended to ``https://r.jina.ai/`` and requested
    with an ``Accept: application/json`` header.

    Args:
        baidu_link: URL of a Baidu search-result link to resolve.
        timeout: Seconds to wait on each HTTP request before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        The decoded JSON body of the r.jina.ai response, or ``None`` when
        any request fails or the body is not valid JSON (the error is
        printed to stdout).
    """
    headers = {'Accept': 'application/json'}
    try:
        # Follow redirects to discover the real target URL. This call lives
        # inside the try block so a network failure here is reported the same
        # way as a failure of the second request instead of propagating.
        redirect_response = requests.get(
            baidu_link, allow_redirects=True, timeout=timeout
        )
        reader_url = "https://r.jina.ai/" + redirect_response.url

        response = requests.get(reader_url, headers=headers, timeout=timeout)
        # Raise HTTPError for 4xx/5xx status codes.
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
    except requests.exceptions.ConnectionError as conn_err:
        print(f'Connection error occurred: {conn_err}')
    except requests.exceptions.Timeout as timeout_err:
        print(f'Timeout error occurred: {timeout_err}')
    except requests.exceptions.RequestException as req_err:
        print(f'An error occurred: {req_err}')
    except ValueError as json_err:
        # Older requests versions raise a plain ValueError from .json()
        # when the body is not valid JSON.
        print(f'Invalid JSON in response: {json_err}')
    return None


chromeOptions = webdriver.ChromeOptions()
chromeOptions.add_argument("--headless")
chromeOptions.add_argument("--remote-debugging-port=9222")
chromeOptions.add_argument('--no-sandbox')
# NOTE: the code preview is truncated by the hosting site at this point; the
# cut-off original statement is kept verbatim below as a comment.
# browser = webdriver.Chrome('/usr/bin/chromedriver.........完整代码请登录后点击上方下载按钮下载查看
网友评论0