import fitz # PyMuPDF
import re
from pathlib import Path
from colorama import Fore
import sys
import os
def search_pdf(pdf_path, format, keywords, context_len=10, base_dir='C:/Users/tellw'):
    """Print every passage of one document in which all keywords occur in order.

    The document's text is normalised (all whitespace removed, lower-cased)
    and cached in a file whose path is ``pdf_path`` taken relative to
    ``base_dir`` and resolved against the current working directory.  Later
    calls read that cache instead of extracting the text again.

    A passage matches when the keywords appear in the given order with at
    most 20 characters between consecutive keywords.  Each hit is printed
    with up to ``context_len`` characters of context on either side, the
    keywords highlighted in red, followed by the source path.

    Args:
        pdf_path: Path of the source document (a PDF or a UTF-8 text file).
        format: ``'pdf'`` or ``'txt'``; selects how the text is extracted
            when no cache file exists yet.  (The name shadows the builtin
            but is kept so existing callers keep working.)
        keywords: Iterable of literal search terms.  They are normalised the
            same way as the document text, so matching ignores case and
            whitespace; regex metacharacters are matched literally.
        context_len: Maximum number of context characters shown before and
            after each hit.
        base_dir: Directory that ``pdf_path`` is made relative to when
            deriving the cache file path.

    Raises:
        ValueError: If no cache file exists and ``format`` is neither
            ``'pdf'`` nor ``'txt'``.
    """
    def _normalize(raw):
        # \s matches any whitespace: spaces, tabs, newlines, full-width
        # (CJK) spaces, and so on.
        return re.sub(r'\s', '', raw).lower()

    # The cached text is whitespace-free and lower-case, so the keywords
    # must be brought into the same form or they could never match.
    terms = [t for t in (_normalize(kw) for kw in keywords) if t]
    if not terms:
        # An empty pattern would "match" the entire document.
        return

    relp = os.path.relpath(pdf_path, base_dir)
    if os.path.exists(relp):
        with open(relp, 'r', encoding='utf8') as f:
            text = f.read()
    else:
        # Extract first and only then create the cache file, so a failed
        # extraction cannot leave behind an empty cache that would hide
        # this document from every later search.
        if format == 'pdf':
            document = fitz.open(pdf_path)
            try:
                text = ''.join(_normalize(page.get_text()) for page in document)
            finally:
                document.close()
        elif format == 'txt':
            with open(pdf_path, 'r', encoding='utf8') as f:
                text = _normalize(f.read())
        else:
            raise ValueError(f'unsupported format: {format!r}')
        cache_dir = os.path.dirname(relp)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        with open(relp, 'w', encoding='utf8') as f:
            f.write(text)

    escaped = [re.escape(t) for t in terms]
    search_re = '.{0,20}'.join(escaped)
    snippet_re = re.compile(
        f'.{{0,{context_len}}}{search_re}.{{0,{context_len}}}')
    # Highlight all keywords in a single pass: substituting one keyword at
    # a time would let later keywords match inside the colour escape codes
    # inserted for earlier ones.  Longer terms go first so that they win
    # over terms that are their prefixes.
    highlight_re = re.compile('|'.join(sorted(escaped, key=len, reverse=True)))

    def _paint(hit):
        # RESET restores the terminal's default colour rather than forcing
        # black, which is unreadable on dark backgrounds.
        return f'{Fore.RED}{hit.group(0)}{Fore.RESET}'

    for match in snippet_re.finditer(text):
        snippet = highlight_re.sub(_paint, match.group(0))
        print(snippet + '\t\t\t\t' + str(pdf_path) + '\n')
# Every command-line argument is a search keyword; without any there is
# nothing to look for, so stop with a non-zero exit status.
if len(sys.argv) < 2:
    sys.exit(1)
keywords = sys.argv[1:]

# Directories whose *.pdf and *.txt files are searched (not recursive).
dirs = [
    'C:/Users/tellw/open_title/file_updates',
    'C:/Users/tellw/open_title/papers/benchmark',
    'C:/Users/tellw/open_title/papers/edge_computing',
    'C:/Users/tellw/open_title/papers/guidance',
    'C:/Users/tellw/open_title/papers/methodology',
    'C:/Users/tellw/open_title/papers/misc',
    'C:/Users/tellw/open_title/papers/other-themes-benchmark',
    'C:/Users/tellw/open_title/papers/speech_recognition',
    'C:/Users/tellw/open_title/papers/test',
    'C:/Users/tellw/open_title/papers/to_c',
    'C:/Users/tellw/open_title/papers/books',
]
pdf_file_paths = [found for folder in dirs for found in Path(folder).glob('*.pdf')]
txt_file_paths = [found for folder in dirs for found in Path(folder).glob('*.txt')]

# search_pdf resolves its cache paths against the working directory, so
# switch to the search-space directory before searching.
os.chdir('C:/Users/tellw/open_title/paper_search_space')
context_len = 30

# All PDFs are searched first, then all text files.
for kind, sources in (('pdf', pdf_file_paths), ('txt', txt_file_paths)):
    for source in sources:
        search_pdf(source, kind, keywords, context_len)
首先由百度 GPT 給出搜尋 PDF 檔案中關鍵字的程式碼,之後在其基礎上改進。搜尋關鍵詞作為指令碼的引數傳入,例如 ['搜','索','內','容']。
指令碼首先找到目標文件——PDF 檔案和 txt 檔案,並在搜尋空間中為每個文件建立對應的檔案:去掉原始檔中的空格、換行符,並統一大小寫,因為這些內容與搜尋結果的展示無關。
然後在搜尋空間裡按照正規表示式 .{0,30}搜.{0,20}索.{0,20}內.{0,20}容.{0,30} 搜尋目標字串。
建立於2404061003,修改於2412042104