我不知道現在有多少人在用網盤搜尋引擎,但就去轉盤網來說,本人傾注了很多心血,現在使用的人數也還可以。網盤資源都有個通病,那就是資源可能失效,但很多引擎都沒有做失效判斷,尤其是一些基於google自定義搜尋的引擎,技術含量不高,站長只花心思賺錢,很少考慮使用者體驗。這篇文章是本人又一篇技術公開部落格,之前本人已經公開了去轉盤網幾乎所有的技術細節,這一篇繼續補充:
首先做個回顧:百度網盤爬蟲 java分詞演算法 資料庫自動備份 代理伺服器爬取 邀請好友註冊
# coding:utf-8
"""
@author:haoning
@create time:2015.8.5
"""
from __future__ import division # 精確除法
from Queue import Queue
from __builtin__ import False
from _sqlite3 import SQLITE_ALTER_TABLE
from collections import OrderedDict
import copy
import datetime
import json
import math
import os
import random
import platform
import re
import threading, errno, datetime
import time
import urllib2
import MySQLdb as mdb
# MySQL connection settings (local development defaults).
# The original text delimited these strings with backticks, which is not
# valid string syntax; ordinary quotes are restored here.
DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'
def gethtml(url):
try:
print "url",url
req = urllib2.Request(url)
response = urllib2.urlopen(req,None,8) #在這裡應該加入代理
html = response.read()
return html
except Exception,e:
print "e",e
if __name__ == `__main__`:
while 1:
#url=`http://pan.baidu.com/share/link?uk=1813251526&shareid=540167442`
url="http://pan.baidu.com/s/1qXQD2Pm"
html=gethtml(url)
print html
結果:e HTTP Error 403: Forbidden,這就是說,度娘他是反爬蟲的,之後看了很多網站,一不小心試了下面的連結:
http://pan.baidu.com/share/link?uk=1813251526&shareid=540167442
if __name__ == `__main__`:
while 1:
url=`http://pan.baidu.com/share/link?uk=1813251526&shareid=540167442`
#url="http://pan.baidu.com/s/1qXQD2Pm"
html=gethtml(url)
print html
結果:<title>百度雲 網盤-連結不存在</title>,你懂的,有這個的必然已經失效,看來度娘沒有反爬蟲,好傢伙。
其實百度網盤的資源入口有兩種方式:
一種是:http://pan.baidu.com/s/1qXQD2Pm,最後為短碼。
另一種是:http://pan.baidu.com/share/link?uk=1813251526&shareid=540167442,關鍵是shareId+uk。前者已知會反爬蟲,後者目前沒有,所以用python測試後,本人又將程式碼翻譯成了java,因為去轉盤是用java寫的,直接上程式碼:
package com.tray.common.utils;
import static org.junit.Assert.*;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Random;
import java.util.Set;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.junit.Test;
/**
* 資源校驗工具
*
* @author hui
*
*/
public class ResourceCheckUtil {
private static Map<String, String[]> rules;
static {
loadRule();
}
/**
* 載入規則庫
*/
public static void loadRule() {
try {
InputStream in = ResourceCheckUtil.class.getClassLoader()
.getResourceAsStream("rule.properties");
Properties p = new Properties();
p.load(in);
Set<Object> keys = p.keySet();
Iterator<Object> iterator = keys.iterator();
String key = null;
String value = null;
String[] rule = null;
rules = new HashMap<String, String[]>();
while (iterator.hasNext()) {
key = (String) iterator.next();
value = (String) p.get(key);
rule = value.split("\|");
rules.put(key, rule);
}
} catch (Exception e) {
e.printStackTrace();
}
}
public static String httpRequest(String url) {
try {
URL u = new URL(url);
Random random = new Random();
HttpURLConnection connection = (HttpURLConnection) u
.openConnection();
connection.setConnectTimeout(3000);//3秒超時
connection.setReadTimeout(3000);
connection.setDoOutput(true);
connection.setDoInput(true);
connection.setUseCaches(false);
connection.setRequestMethod("GET");
String[] user_agents = {
"Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11",
"Opera/9.25 (Windows NT 5.1; U; en)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12",
"Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9",
"Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
"Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 "
};
int index=random.nextInt(7);
/*connection.setRequestProperty("Content-Type",
"text/html;charset=UTF-8");*/
connection.setRequestProperty("User-Agent",user_agents[index]);
/*connection.setRequestProperty("Accept-Encoding","gzip, deflate, sdch");
connection.setRequestProperty("Accept-Language","zh-CN,zh;q=0.8");
connection.setRequestProperty("Connection","keep-alive");
connection.setRequestProperty("Host","pan.baidu.com");
connection.setRequestProperty("Cookie","");
connection.setRequestProperty("Upgrade-Insecure-Requests","1");*/
InputStream in = connection.getInputStream();
BufferedReader br = new BufferedReader(new InputStreamReader(in,
"utf-8"));
StringBuffer sb = new StringBuffer();
String line = null;
while ((line = br.readLine()) != null) {
sb.append(line);
}
return sb.toString();
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
return null;
}
@Test
public void test7() throws Exception {
System.out.println(isExistResource("http://pan.baidu.com/s/1jGjBmyq",
"baidu"));
System.out.println(isExistResource("http://pan.baidu.com/s/1jGjBmyqa",
"baidu"));
System.out.println(isExistResource("http://yunpan.cn/cQx6e6xv38jTd","360"));
System.out.println(isExistResource("http://yunpan.cn/cQx6e6xv38jTdd",
"360"));
System.out.println(isExistResource("http://share.weiyun.com/ec4f41f0da292adb89a745200b8e8b57","weiyun"));
System.out.println(isExistResource("http://share.weiyun.com/ec4f41f0da292adb89a745200b8e8b57dd",
"360"));
System.out.println(isExistResource("http://cloud.letv.com/s/eiGLzuSes","leshi"));
System.out.println(isExistResource("http://cloud.letv.com/s/eiGLzuSesdd",
"leshi"));
}
/**
* 獲取指定頁面上標籤的內容
*
* @param url
* @param tagName
* 標籤名稱
* @return
*/
private static String getHtmlContent(String url, String tagName) {
String html = httpRequest(url);
if(html==null){
return "";
}
Document doc = Jsoup.parse(html);
//System.out.println("doc======"+doc);
Elements tag=null;
if(tagName.equals("<h3>")){ //針對微雲
tag=doc.select("h3");
}
else if(tagName.equals("class")){ //針對360
tag=doc.select("div[class=tip]");
}
else{
tag= doc.getElementsByTag(tagName);
}
//System.out.println("tag======"+tag);
String content="";
if(tag!=null&&!tag.isEmpty()){
content = tag.get(0).text();
}
return content;
}
public static int isExistResource(String url, String ruleName) {
try {
String[] rule = rules.get(ruleName);
String tagName = rule[0];
String opt = rule[1];
String flag = rule[2];
/*System.out.println("ruleName"+ruleName);
System.out.println("tagName"+tagName);
System.out.println("opt"+opt);
System.out.println("flag"+flag);
System.out.println("url"+url);*/
String content = getHtmlContent(url, tagName);
//System.out.println("content="+content);
if(ruleName.equals("baidu")){
if(content.contains("百度雲升級")){ //升級作為不存在處理
return 1;
}
}
String regex = null;
if ("eq".equals(opt)) {
regex = "^" + flag + "$";
} else if ("bg".equals(opt)) {
regex = "^" + flag + ".*$";
} else if ("ed".equals(opt)) {
regex = "^.*" + flag + "$";
} else if ("like".equals(opt)) {
regex = "^.*" + flag + ".*$";
}else if("contain".equals(opt)){
if(content.contains(flag)){
return 0;
}
else{
return 1;
}
}
if(content.matches(regex)){
return 1;
}
} catch (Exception e) {
e.printStackTrace();
}
return 0;
}
// public static void main(String[] args)throws Exception {
// final Path p = Paths.get("C:/Users/hui/Desktop/6-14/");
// final WatchService watchService =
// FileSystems.getDefault().newWatchService();
// p.register(watchService, StandardWatchEventKinds.ENTRY_MODIFY);
// new Thread(new Runnable() {
//
// public void run() {
// while(true){
// System.out.println("檢測中。。。。");
// try {
// WatchKey watchKey = watchService.take();
// List<WatchEvent<?>> watchEvents = watchKey.pollEvents();
//
// for(WatchEvent<?> event : watchEvents){
// //TODO 根據事件型別採取不同的操作。。。。。。。
// System.out.println("["+p.getFileName()+"/"+event.context()+"]檔案發生了["+event.kind()+"]事件");
// }
// watchKey.reset();
//
// } catch (Exception e) {
// e.printStackTrace();
// }
// }
// }
// }).start();
// }
// @Test
// public void testName() throws Exception {
// System.out.println(new String("u8BF7u8F93u5165u63D0u53D6u7801".getBytes("utf-8"), "utf-8"));
// }
}
注意程式碼本身是要用來相容360、微盤等網盤的,但有些網盤倒了,大家都知道,不過程式碼還是得在,這才是程式猿該有的思路,那就是可擴展。注意程式碼有個配置檔案(rule.properties),我也附上吧:
360=class|contain|\u5206\u4EAB\u8005\u5DF2\u53D6\u6D88\u6B64\u5206\u4EAB
baidu=title|contain|\u94FE\u63A5\u4E0D\u5B58\u5728
weiyun=<h3>|contain|\u5206\u4EAB\u8D44\u6E90\u5DF2\u7ECF\u5220\u9664
leshi=title|ed|\u63D0\u53D6\u6587\u4EF6
sorry,unicode編碼,麻煩你自己轉下碼吧,不會請百度:unicode轉碼工具
到此,去轉盤網連結是否失效的驗證,程式碼我已經完全公開,喜歡這篇部落格的孩子請收藏並關注下。
本人建個qq群,歡迎大家一起交流技術, 群號:512245829 喜歡微博的朋友關注:轉盤娛樂即可