# 0_originJson_writeURL_inTxt.py:
import urllib.request
import json
# JSON endpoint that lists the sample enterprise sites.
SOURCE_URL = r"http://www.txwz.qq.com/lib/index.php?m=enterprise&a=get_exsample"

# Text file the extracted URLs are appended to; change this to your own path.
# Forward slashes work on Windows as well.
file_name = "f:/2017-05-16.txt"

# Seconds to wait for the endpoint before giving up instead of hanging forever.
FETCH_TIMEOUT = 30


def normalize_url(url):
    """Return *url* stripped of whitespace and guaranteed to carry a scheme.

    ``http://`` is prepended unless the value already starts with
    ``http://`` or ``https://`` (compared case-insensitively).  A plain
    substring test is not enough: a host such as ``httpbin.org`` contains
    "http" without having a scheme.
    """
    url = url.strip()
    if url.lower().startswith(("http://", "https://")):
        return url
    return "http://" + url


def extract_urls(decoded):
    """Return the normalized URLs stored in a decoded JSON payload.

    The "n" field of every entry in ``decoded["data"]`` is read.  Entries
    whose "n" value is missing, not a string, or blank are skipped so that
    no bare ``http://`` line is produced.

    Raises:
        KeyError: if the payload has no "data" key.
    """
    urls = []
    for entry in decoded["data"]:
        raw = entry.get("n")
        if not isinstance(raw, str) or not raw.strip():
            continue
        urls.append(normalize_url(raw))
    return urls


def main():
    """Download the sample list and append one URL per line to ``file_name``."""
    with urllib.request.urlopen(SOURCE_URL, timeout=FETCH_TIMEOUT) as response:
        origin_json_data = response.read()
    ndata = json.loads(origin_json_data)  # decoded JSON payload

    # Append mode: URLs from earlier runs are kept.
    with open(file_name, "a") as file_open:
        file_open.writelines(url + "\n" for url in extract_urls(ndata))
    print("txt_write finish")


if __name__ == "__main__":
    main()
# 1_txt_openableUrl_saveInTxt.py:
import urllib.request
import time
import requests #抓取並儲存網頁原始碼要用的包
# Text file holding the URLs to check, one per line; change to your own path.
URL_LIST_PATH = 'f:/1.txt'
# Every URL that could be opened is appended to this file; change as needed.
OPENABLE_LIST_PATH = "f:/URL_open.txt"
# Directory prefix under which the downloaded page sources are stored.
PAGE_DIR = "f:/"
# Seconds to wait on any single request before treating the site as down.
REQUEST_TIMEOUT = 15

opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/49.0.2')]


def read_urls(path):
    """Return the non-blank lines of the file at *path*, whitespace stripped."""
    with open(path) as url_file:
        stripped = (line.strip() for line in url_file)
        return [line for line in stripped if line]


def page_file_name(count, url):
    """Return the file name (without directory) for a saved page source.

    The name is ``<count>_<url>.txt`` where a leading ``http://`` or
    ``https://`` is removed from the URL and every character that is not
    allowed in a Windows file name (path separators, ``: * ? " < > |``) or
    is whitespace is replaced by ``_``.  Without this, a URL containing a
    path or query string yields a name that ``open()`` cannot create.
    """
    lowered = url.lower()
    for scheme in ("http://", "https://"):
        if lowered.startswith(scheme):
            url = url[len(scheme):]
            break
    safe = "".join(
        "_" if ch in '\\/:*?"<>|' or ch.isspace() else ch for ch in url
    )
    return str(count) + "_" + safe + ".txt"


def is_openable(url):
    """Return True when *url* can be opened with the module-level opener.

    ``urllib.error.URLError`` (and its subclass ``HTTPError``) derive from
    ``OSError``, as do socket timeouts and connection resets, so a single
    ``OSError`` clause covers them.  ``ValueError`` is raised for strings
    that are not valid URLs (for example a missing scheme).
    """
    try:
        with opener.open(url, timeout=REQUEST_TIMEOUT):
            return True
    except (OSError, ValueError):
        return False


def save_page_source(count, url):
    """Download *url* with ``requests`` and append its text to a per-URL file.

    Returns True when the page was fetched and written, False when the
    download failed (the failure is reported on stdout and not raised).
    """
    try:
        html = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.RequestException:
        print(url+'=訪問頁面出錯')
        return False
    with open(PAGE_DIR + page_file_name(count, url), 'a', encoding='utf-8') as page:
        page.write(html.text)
    return True


def main():
    """Check every listed URL, record the reachable ones and save their source."""
    aa = read_urls(URL_LIST_PATH)
    print(aa)
    print('開始檢查:')
    count = 0  # number of URLs from the list that could be opened
    for tempUrl in aa:
        if is_openable(tempUrl):
            print(tempUrl+'沒問題')
            # Re-open in append mode per URL so results survive an aborted run.
            with open(OPENABLE_LIST_PATH, "a") as newfile:
                newfile.write(tempUrl+"\n")
            count = count + 1
            save_page_source(count, tempUrl)
        else:
            print(tempUrl+'=訪問頁面出錯')
            time.sleep(2)  # back off a little after a failed request
        time.sleep(0.1)  # small pause between consecutive sites
    print("txt_openableUrl_saveInTxt-------->finish")


if __name__ == "__main__":
    main()