搜尋本地pdf檔案內容

园糯發表於2024-12-04
import fitz  # PyMuPDF
import re
from pathlib import Path
from colorama import Fore
import sys
import os
 
def search_pdf(pdf_path, format, keywords, context_len=10, base_dir='C:/Users/tellw'):
    """Search one document for keywords that occur close together, in order.

    The document's text is first normalised (all whitespace removed, then
    lower-cased) and cached in a file whose path, relative to the current
    working directory, mirrors ``pdf_path`` relative to ``base_dir``. An
    existing cache file is reused as-is, without re-reading the source.

    Every match is printed with the keywords highlighted in red, followed by
    the path of the source document.

    Args:
        pdf_path: Path of the source document (a PDF or a UTF-8 text file).
        format: ``'pdf'`` to extract text with PyMuPDF, ``'txt'`` to read the
            file as UTF-8 text. Only consulted when the cache must be built.
        keywords: Literal strings that must appear in this order, with at
            most 20 characters between consecutive keywords. They are
            normalised the same way as the cached text (whitespace removed,
            lower-cased) so that they can actually match it.
        context_len: Maximum number of characters of context captured on
            each side of a match.
        base_dir: Directory that ``pdf_path`` is made relative to when
            deriving the cache file location.

    Raises:
        ValueError: If the cache has to be built and ``format`` is neither
            ``'pdf'`` nor ``'txt'``.
    """
    cache_path = os.path.relpath(pdf_path, base_dir)
    if not os.path.exists(cache_path):
        # Read the whole source BEFORE creating the cache file, so a failed
        # extraction cannot leave behind an empty cache that later runs
        # would silently trust.
        if format == 'pdf':
            document = fitz.open(pdf_path)
            try:
                raw_text = ''.join(page.get_text() for page in document)
            finally:
                document.close()
        elif format == 'txt':
            with open(pdf_path, 'r', encoding='utf8') as f:
                raw_text = f.read()
        else:
            raise ValueError(f'unsupported format: {format!r}')
        # os.path.dirname copes with any path separator and with files that
        # sit directly in base_dir (no directory component at all).
        cache_dir = os.path.dirname(cache_path)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        with open(cache_path, 'w', encoding='utf8') as f:
            # \s matches any whitespace: spaces, tabs, newlines,
            # full-width (CJK) spaces, and so on.
            f.write(re.sub(r'\s', '', raw_text).lower())

    with open(cache_path, 'r', encoding='utf8') as f:
        text = f.read()

    # The cached text has no whitespace and no uppercase letters, so the
    # keywords need the same treatment or they could never match.
    needles = [re.sub(r'\s', '', kw).lower() for kw in keywords]
    needles = [needle for needle in needles if needle]
    if not needles:
        return

    # Keywords are literal text, not regular expressions: escape them so
    # input such as "c++" or "(" neither breaks nor alters the pattern.
    gap = '.{0,20}'.join(re.escape(needle) for needle in needles)
    match_re = re.compile(f'.{{0,{context_len}}}{gap}.{{0,{context_len}}}')

    # Highlight in a single pass; substituting keyword by keyword would let a
    # later keyword match inside the colour codes inserted for an earlier
    # one. Longer keywords are tried first so they win over their prefixes.
    by_length = sorted(set(needles), key=len, reverse=True)
    highlight_re = re.compile('|'.join(re.escape(needle) for needle in by_length))

    def paint(match):
        # Fore.RESET restores the terminal's default foreground colour;
        # Fore.BLACK would force black text, unreadable on dark backgrounds.
        return f'{Fore.RED}{match.group(0)}{Fore.RESET}'

    for snippet in match_re.findall(text):
        print(highlight_re.sub(paint, snippet) + '\t\t\t\t' + str(pdf_path) + '\n')

# Every command-line argument after the script name is a search keyword;
# with none given there is nothing to look for, so exit with an error status.
if len(sys.argv)<2:
    sys.exit(1)
keywords=sys.argv[1:]

# Folders scanned (non-recursively) for source documents.
dirs=[
    'C:/Users/tellw/open_title/file_updates',
    'C:/Users/tellw/open_title/papers/benchmark',
    'C:/Users/tellw/open_title/papers/edge_computing',
    'C:/Users/tellw/open_title/papers/guidance',
    'C:/Users/tellw/open_title/papers/methodology',
    'C:/Users/tellw/open_title/papers/misc',
    'C:/Users/tellw/open_title/papers/other-themes-benchmark',
    'C:/Users/tellw/open_title/papers/speech_recognition',
    'C:/Users/tellw/open_title/papers/test',
    'C:/Users/tellw/open_title/papers/to_c',
    'C:/Users/tellw/open_title/papers/books',
]

# Collect the documents before changing directory, PDFs and text files kept
# in separate lists so that all PDFs are searched first.
pdf_file_paths=[found for folder in dirs for found in Path(folder).glob('*.pdf')]
txt_file_paths=[found for folder in dirs for found in Path(folder).glob('*.txt')]

# search_pdf stores its normalised text caches relative to the current
# working directory, so switch to the dedicated search-space folder.
os.chdir('C:/Users/tellw/open_title/paper_search_space')

# Characters of context shown on each side of a match.
context_len=30
for kind,paths in (('pdf',pdf_file_paths),('txt',txt_file_paths)):
    for document_path in paths:
        search_pdf(document_path,kind,keywords,context_len)

這段程式最初由百度gpt給出,用來搜尋pdf檔案中的關鍵字,之後我在其基礎上做了改進。搜尋關鍵詞作為指令碼的引數傳入,例如['搜','索','內','容']。程式首先找到目標文件(pdf檔案和txt檔案),並在搜尋空間裡為每份文件建立對應的文字檔:去掉原始檔中的空格、換行符等空白字元,並統一轉為小寫,因為這些內容與搜尋結果的展示無關。最後在搜尋空間裡按照正規表示式 .{0,30}搜.{0,20}索.{0,20}內.{0,20}容.{0,30} 搜尋目標字串。

建立於2404061003,修改於2412042104

相關文章