由於個人需要國內所有省、市、區的 IP 地址段與經緯度的對應關係,而且最好是資料庫可以直接使用的資料來源,經調研發現可以使用百度 API 來實現這個功能:先找到所有的 IP 地址段,然後呼叫 API 查詢。
主要是用 Tornado 做非同步爬蟲來實現的。
import time
import json
from datetime import timedelta
from bs4 import BeautifulSoup
import os
from tornado import httpclient, gen, ioloop, queues
from api import map_ip_region_provice
# Baidu Map API key; you can apply for one with a Baidu developer account.
# NOTE(review): a literal key is committed here -- consider loading it from
# the environment or a config file instead.
AK = "umG08TkpEwmRWkUPx4MaM0FGW3h4i4L"

# Query template for the Baidu IP-location endpoint.
# Positional fields: {0} is the IP to look up, {1} is the API key.
base_url = "http://api.map.baidu.com/location/ip?ak={1}&ip={0}"

# GitHub page listing the Chinese IP ranges that get_iplist() scrapes.
IP_URL = "https://github.com/17mon/china_ip_list/blob/master/china_ip_list.txt"

concurrency = 20  # number of concurrent requests
def is_file_exist(result=None, path="iplist.txt"):
    """Read or write the on-disk cache of IP ranges.

    Called with a non-empty ``result``, the entries are written to ``path``
    one per line (replacing any previous content) and ``result`` is returned.

    Called without data, the cache file is left untouched: if it exists its
    non-blank lines are returned, otherwise an empty list is returned.
    Previously this case opened the file in ``w+`` mode, which truncated the
    cache and always reported it as empty.

    Args:
        result: Iterable of strings to store, or ``None``/empty to only read.
        path: Location of the cache file.

    Returns:
        The list that was written, or the list of cached lines, or ``[]``
        when there is nothing cached.
    """
    if result:
        with open(path, "w", encoding="utf-8") as ipfile:
            ipfile.writelines(entry + "\n" for entry in result)
        return result

    # Read-only path: never open for writing here, or the cache is wiped.
    if not os.path.exists(path):
        return []
    with open(path, encoding="utf-8") as ipfile:
        return [line.strip() for line in ipfile if line.strip()]
def get_iplist():
    """Return the list of IP ranges, downloading it from GitHub if needed.

    The local cache is consulted first via ``is_file_exist()`` to save time.
    When that yields nothing, ``IP_URL`` is fetched synchronously, the ranges
    are extracted from the rendered file table, stored through
    ``is_file_exist(result)`` and returned.

    Returns:
        A list of strings, one per table cell scraped from the page (or read
        from the cache). An empty list is returned when the download fails
        or the expected table is not present in the page.
    """
    # Check whether the IP list already exists locally, to save time.
    cached = is_file_exist()
    if cached:
        return cached

    http_client = httpclient.HTTPClient()
    try:
        body = http_client.fetch(IP_URL).body
    except httpclient.HTTPError as e:
        print("Error: " + str(e))
        return []
    except Exception as e:
        # Other errors are possible, such as IOError.
        print("Error: " + str(e))
        return []
    finally:
        # Release the client on success and on failure alike.
        http_client.close()

    soup = BeautifulSoup(body, "lxml")
    # attrs must be a dict; the original passed a set ({'class', '...'}).
    table = soup.find(
        "table",
        attrs={"class": "highlight tab-size js-file-line-container"},
    )
    if table is None:
        print("Error: ip table not found in " + IP_URL)
        return []

    cells = table.find_all(
        "td", attrs={"class": "blob-code blob-code-inner js-file-line"}
    )
    # .string is None for cells without a single text child; skip those so
    # the cache writer only ever receives strings.
    result = [cell.string for cell in cells if cell.string]
    print("fetch ip .....")
    is_file_exist(result)
    return result
具體可以檢視