python提取pdf文件中的链接代码
代码语言:python
所属分类:其他
代码描述:使用 Python 提取 PDF 文件中的链接,主要使用了 PyMuPDF 和 pikepdf 库,非常方便。
下面为部分代码预览,完整代码请点击下载或在bfwstudio webide中打开
def extract_link_uris(pages):
    """Collect the ``/URI`` target of every link annotation in *pages*.

    Pages without an ``/Annots`` entry and annotations without an ``/A``
    (action) entry are skipped instead of raising, since not every page has
    annotations and not every annotation is a link.

    Args:
        pages: Iterable of page objects. Each page, annotation and action
            only needs a mapping-style ``get(key)`` that returns ``None``
            when the key is absent.

    Returns:
        A list of the URI values found, in page order then annotation order.
    """
    found = []
    for page in pages:
        annotations = page.get("/Annots")
        if annotations is None:
            # Page carries no annotations at all.
            continue
        for annotation in annotations:
            action = annotation.get("/A")
            if action is None:
                # Annotation without an action (e.g. a note or highlight).
                continue
            uri = action.get("/URI")
            if uri is not None:
                print("[+] 找到链接:", uri)
                found.append(uri)
    return found


if __name__ == "__main__":
    # Third-party imports live here so that extract_link_uris() above can be
    # imported and used without pikepdf / PyMuPDF being installed.
    import pikepdf  # pip3 install pikepdf

    file = "/data/wwwroot/default/asset/bfw.pdf"
    # file = "1710.05006.pdf"
    pdf_file = pikepdf.Pdf.open(file)
    # iterate over PDF pages
    urls = extract_link_uris(pdf_file.pages)
    print("[*] Total URLs extracted:", len(urls))

    import fitz  # pip install PyMuPDF
    import re

    # a regular expression of URLs
    # NOTE: the preview this snippet was taken from is cut off in the middle
    # of the pattern, so the assignment is kept as a comment rather than
    # guessed at. The visible fragment is:
    #   url_regex = r"https?:\/\/(www\.)?[-a-zA-...
    # The rest of the script is not included in the preview (the page says
    # the full code can be downloaded after logging in).
网友评论0