1、貼出主要程式碼。這個不是python,python只涉及服務端對資訊提取結果的接收。主體是java + android + js。由於淘寶各模組都使用不同的二級子域名,而ajax不能跨域,無法只在一個頁面內完成所有請求,需要依次載入不同的頁面。以下是主要部分。js內容由服務端分發。
這樣做的好處是:即使不使用微服務,單臺機器也能滿足1000個使用者在同一分鐘內提交賬號密碼請求登入,既簡化後臺編寫的複雜度,又減小伺服器壓力。密碼、驗證碼的校驗也更及時。
2、不是爬自己的資訊,是獲取別人 任意賬號 + 密碼的淘寶個人資訊,如果是為了拿到自己的資訊,搞這麼多七七八八的那是閒的蛋疼。具體賬號 密碼是哪來的,置頂第一篇有介紹。
package com.touna.crawlmodule; import android.graphics.Bitmap; import android.net.http.SslError; import android.support.v7.app.AppCompatActivity; import android.os.Bundle; import android.util.Log; import android.view.View; import android.webkit.CookieManager; import android.webkit.JavascriptInterface; import android.webkit.SslErrorHandler; import android.webkit.ValueCallback; import android.webkit.WebChromeClient; import android.webkit.WebSettings; import android.webkit.WebView; import android.webkit.WebViewClient; import org.json.JSONObject; import com.xx.httprequest.CrawlResultSender; import com.xx.view.LogUtil; import com.xx.view.ViewUtil; import com.xx.view.WebViewTimer; public class TaobaoActivity extends AppCompatActivity { private static final String TAG = "MainActivity"; private static final String LOGINPAGEURL = "https://login.m.taobao.com/login.htm";//移動端登陸頁面 private static final String MOBILEINDEXPAGEURL = "http://h5.m.taobao.com/mlapp/mytaobao.html";//移動端淘寶個人使用者首頁 private static final String PCINDEXPAGEURL = "https://www.taobao.com/"; private static final String BINDPAGEURL = "http://member1.taobao.com/member/fresh/account_management.htm"; private static final String COLLECTIONURL = "https://shoucang.taobao.com/nodejs/item_collect_chunk.htm";//收藏頁面url private static final String ADDRESSURL = "https://member1.taobao.com/member/fresh/deliver_address.htm";//收貨地址url private static final String MYPATHURL = "https://lu.taobao.com/newMyPath.htm";//我的足跡url private static final String BOUGHTSHOPSURL = "https://favorite.taobao.com/list_bought_shops_n.htm";//已經購買的店鋪 private static final String BOUGHTITEMSURL = "https://buyertrade.taobao.com/trade/itemlist/list_bought_items.htm";//已經購買的物品 private static final String SHOPCARTURL = "https://cart.taobao.com/cart.htm";//購物車URL private static final String SAFESETTINGURL = "http://member1.taobao.com/member/fresh/certify_info.htm";//安全資訊設定 private static final String TRADEINFOURL = 
"http://member1.taobao.com/member/fresh/account_profile.htm";//交易資訊url private static final String PERSONALINFOURL = "https://i.taobao.com/user/baseInfoSet.htm";//個人資料url private static final String POINTSURL = "https://pages.tmall.com/wow/jifen/act/point-details";//積分URL private static final String WEIBOURL = "http://member1.taobao.com/member/fresh/weibo_bind_management.htm";//繫結微博URL private static final String REFUSEURL = "https://refund2.tmall.com/dispute/buyerDisputeList.htm?type=1&disputeType=1";//退貨管理URL private static final String HUABEIURL = "https://i.taobao.com/my_taobao.htm";//支付寶餘額和花唄額度 private JSONObject dataJson=new JSONObject(); @Override protected void onCreate(Bundle savedInstanceState) { super.onCreate(savedInstanceState); setContentView(R.layout.activity_taobo); startWebView(); } private void startWebView() { WebView webView = findViewById(R.id.taobaoView); final WebSettings settings = webView.getSettings(); settings.setUseWideViewPort(true); settings.setLayoutAlgorithm(WebSettings.LayoutAlgorithm.NARROW_COLUMNS); settings.setLoadWithOverviewMode(true); settings.setJavaScriptEnabled(true); webView.addJavascriptInterface(new JsInterface(), "JsInterface"); settings.setJavaScriptEnabled(true); settings.setLoadWithOverviewMode(true); settings.setSupportZoom(true); settings.setDomStorageEnabled(true); settings.setCacheMode(WebSettings.LOAD_NO_CACHE); settings.setAllowFileAccess(true); settings.setUseWideViewPort(true); settings.setSupportMultipleWindows(true); settings.setLoadsImagesAutomatically(true); //settings.setBlockNetworkImage(false); settings.setDefaultTextEncodingName("GBK"); webView.setVerticalScrollBarEnabled(true); webView.setHorizontalScrollBarEnabled(true); settings.setUserAgentString("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"); webView.setWebChromeClient(new WebChromeClient()); startWebViewClient(webView); webView.loadUrl(LOGINPAGEURL); } /** * @param view 
WebView物件 * 初始化webviewClient */ private void startWebViewClient(WebView view) { view.setWebViewClient(new WebViewClient() { @Override public void onReceivedSslError(WebView view, SslErrorHandler handler, SslError error) { handler.proceed(); } @Override public void onPageStarted(final WebView view, String url, Bitmap favicon) { Log.e(TAG, "onPageStarted: " + url); if (url.contains(LOGINPAGEURL)){ view.setVisibility(View.GONE); } } /** * @param view 瀏覽器物件 * @param url 瀏覽器地址 */ @Override public void onPageFinished(final WebView view, String url) { Log.e(TAG, "onPageFinished: " + url); if (url.contains(LOGINPAGEURL)) { ViewUtil.injectScriptFile(view, "loginPage/taobaoInit.js"); view.loadUrl("javascript:initLoginPage()"); new WebViewTimer(view, 300){ @Override public void operateView(){ view.setVisibility(View.VISIBLE); } }; } if (url.contains(MOBILEINDEXPAGEURL)) { //view.getSettings().setUserAgentString("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"); ViewUtil.setNoImage(view); //關閉圖片 view.loadUrl(PCINDEXPAGEURL); } if (PCINDEXPAGEURL.equals(url)) { view.loadUrl(REFUSEURL); } if (url.contains(REFUSEURL)) { ViewUtil.injectScriptFile(view, "jquery.min.js"); //此處需要jquery! 
ViewUtil.injectScriptFromInternet(view, "taobao/refund.js"); view.loadUrl("javascript:window.JsInterface.getReturnString(extractRefund());"); view.loadUrl(POINTSURL); } if (url.contains(POINTSURL)) { Log.e(TAG, "onPageFinished: inject"); ViewUtil.injectScriptFromInternet(view, "taobao/point.js"); view.loadUrl("javascript:window.JsInterface.getReturnString(extractPoint());"); view.loadUrl(PERSONALINFOURL); } if (url.contains(PERSONALINFOURL)) { ViewUtil.injectScriptFromInternet(view, "taobao/personalInformation.js"); view.loadUrl("javascript:window.JsInterface.getReturnString(extractPersonalInformation());"); view.loadUrl(COLLECTIONURL); } if (url.contains(COLLECTIONURL)) { ViewUtil.injectScriptFromInternet(view, "taobao/collect.js"); view.loadUrl("javascript:window.JsInterface.getReturnString(extractCollect());"); view.loadUrl(ADDRESSURL); } if (url.contains(ADDRESSURL)) { ViewUtil.injectScriptFromInternet(view, "taobao/delivery.js"); view.loadUrl("javascript:window.JsInterface.getReturnString(extractDelivery());"); view.loadUrl(MYPATHURL); } if (url.contains(MYPATHURL)) { ViewUtil.injectScriptFromInternet(view, "taobao/footprint.js"); view.loadUrl("javascript:window.JsInterface.getReturnString(extractFootprint());"); view.loadUrl(BOUGHTSHOPSURL); } if (url.contains(BOUGHTSHOPSURL)) { ViewUtil.injectScriptFromInternet(view, "taobao/havaboughtStore.js"); view.loadUrl("javascript:window.JsInterface.getReturnString(extractHaveBoughtStore());"); view.loadUrl(BOUGHTITEMSURL); } if (url.contains(BOUGHTITEMSURL)) { ViewUtil.injectScriptFromInternet(view, "taobao/havebought.js"); view.loadUrl("javascript:window.JsInterface.getReturnString(extractHaveBought());"); view.loadUrl(SHOPCARTURL); } if (url.contains(SHOPCARTURL)) { ViewUtil.injectScriptFromInternet(view, "taobao/shoppingCart.js"); view.loadUrl("javascript:window.JsInterface.getReturnString(extractShoppingCart());"); view.loadUrl(SAFESETTINGURL); } if (url.contains(SAFESETTINGURL)) { 
ViewUtil.injectScriptFromInternet(view, "taobao/safeSettings.js"); view.loadUrl("javascript:window.JsInterface.getReturnString(extractSafeSettings());"); view.loadUrl(TRADEINFOURL); } if (url.contains(TRADEINFOURL)) { ViewUtil.injectScriptFromInternet(view, "taobao/tradeInfo.js"); view.loadUrl("javascript:window.JsInterface.getReturnString(extractTradeInfo());"); view.loadUrl(WEIBOURL); } if (url.contains(WEIBOURL)) { ViewUtil.injectScriptFromInternet(view, "taobao/weibo.js"); view.loadUrl("javascript:window.JsInterface.getReturnString(extractWeibo());"); view.loadUrl(BINDPAGEURL); } if (url.contains(BINDPAGEURL)) { ViewUtil.injectScriptFromInternet(view, "taobao/alipayBinding.js"); view.loadUrl("javascript:window.JsInterface.getReturnString(extractAlipay());"); view.loadUrl(HUABEIURL); } if (url.contains(HUABEIURL)) { ViewUtil.injectScriptFromInternet(view, "taobao/huabei.js"); view.loadUrl("javascript:clickHuabei1()"); new WebViewTimer(view, 2000){ @Override public void operateView(){ view.loadUrl("javascript:clickHuabei2()"); } }; new WebViewTimer(view, 4000){ @Override public void operateView(){ view.evaluateJavascript("extractHuabei()", new ValueCallback<String>() { @Override public void onReceiveValue(String s) { Log.e(TAG, "onReceiveValue: "+s ); String jsonStr = ViewUtil.getStrLikeJson(s); ViewUtil.reconsituteJSon(jsonStr, dataJson); ViewUtil.showLargeLog(dataJson.toString()); CrawlResultSender.sendToweb("taobao", dataJson.toString()); } }); } }; } } }); } class JsInterface { private static final String TAG = "JSInterface"; @JavascriptInterface public void getReturnString(String returnValue) throws Exception{ Log.e(TAG,"當前項返回值是: " + returnValue); ViewUtil.reconsituteJSon(returnValue,dataJson); } } }
貼出其中一個js例項,例如提取使用者所收藏的物品。這裡不是直接翻頁,而是使用ajax以提升效率,ajax必須使用同步方式。由於此介面返回的是頁面而不是json,可以用css選擇器提取。
/**
 * Created by ㄟ(▔=▔)ㄏ on 2018/1/5.
 */
/*
 * https://shoucang.taobao.com/nodejs/item_collect_chunk.htm?ifAllTag=0&tab=0&tagId=&categoryCount=0&type=0&tagName=&categoryName=&needNav=false&startRow=0
 * Extracts the user's collected (favourited) items.
 */

/**
 * Minimal XMLHttpRequest wrapper.
 *
 * @param {Object} [opt]
 * @param {string} [opt.type='POST'] HTTP method; only GET and POST are sent, anything else
 *        returns an un-opened request object.
 * @param {string} [opt.url=''] Target URL; may already contain a query string.
 * @param {boolean} [opt.async=false] Whether the request is asynchronous.
 * @param {Object} [opt.data=null] Key/value pairs, URL-encoded into the body (POST) or
 *        appended to the query string (GET).
 * @param {Function} [opt.success] Accepted and defaulted but never invoked by this helper;
 *        callers read the returned request object instead.
 * @returns {XMLHttpRequest} The request object (already completed when opt.async is false).
 */
function myajax(opt) {
    opt = opt || {};
    // Default BEFORE upper-casing: calling toUpperCase() on a missing type would throw.
    opt.type = (opt.type || 'POST').toUpperCase();
    opt.url = opt.url || '';
    opt.async = opt.async || false;
    opt.data = opt.data || null;
    opt.success = opt.success || function () {};

    var xmlHttp = null;
    // typeof guard: a bare reference to an undefined global would raise a ReferenceError.
    if (typeof XMLHttpRequest !== 'undefined') {
        xmlHttp = new XMLHttpRequest();
    } else {
        xmlHttp = new ActiveXObject('Microsoft.XMLHTTP');
    }

    var params = [];
    for (var key in opt.data) {
        if (Object.prototype.hasOwnProperty.call(opt.data, key)) {
            params.push(encodeURIComponent(key) + '=' + encodeURIComponent(opt.data[key]));
        }
    }
    var query = params.join('&');

    if (opt.type === 'POST') {
        xmlHttp.open(opt.type, opt.url, opt.async);
        xmlHttp.setRequestHeader('Content-Type', 'application/x-www-form-urlencoded;charset=utf-8');
        xmlHttp.send(query);
    } else if (opt.type === 'GET') {
        var target = opt.url;
        if (query) {
            // Join with '&' when the URL already has a query string; never append a bare '?'.
            target += (target.indexOf('?') === -1 ? '?' : '&') + query;
        }
        xmlHttp.open(opt.type, target, opt.async);
        xmlHttp.send(null);
    }
    return xmlHttp;
}


/**
 * Fetches the favourites list chunk by chunk with synchronous GET requests and collects
 * name, URL and price of every item found in each fetched chunk.
 *
 * @returns {string} JSON text of the form {"collectInfo":[{collectName, collectUrl, collectPrice}, ...]}
 */
function extractCollect() {
    var LAST_PAGE_INDEX = 3;   // page indices 0..3 are requested, i.e. at most 3 page turns
    var PAGE_SIZE = 30;        // items per chunk; used to compute startRow
    var BASE_URL = 'https://shoucang.taobao.com/nodejs/item_collect_chunk.htm?ifAllTag=0&tab=0&tagId=&categoryCount=0&type=0&tagName=&categoryName=&needNav=false&startRow=';

    var collectList = [];

    for (var page = 0; page <= LAST_PAGE_INDEX; page++) {
        console.debug("當前是第 " + page + "頁");
        var url = BASE_URL + (page * PAGE_SIZE);
        var response = myajax({
            type: 'GET',
            url: url,
            async: false
        });
        var htmlStr = response.responseText || '';

        // No item markup in the response means there is nothing further to page through.
        if (htmlStr.indexOf("J_FavListItem") === -1) {
            break;
        }

        // Parse the FETCHED chunk; querying the live document would re-read the same
        // items on every iteration.
        var chunkDoc = new DOMParser().parseFromString(htmlStr, 'text/html');
        var items = chunkDoc.querySelectorAll('li.J_FavListItem');
        for (var i = 0; i < items.length; i++) {
            var titleLink = items[i].querySelector('a.img-item-title-link');
            if (!titleLink) {
                continue; // malformed entry without a title link
            }
            var priceElement = items[i].querySelector('.g_price strong');
            // A missing price element means the item is no longer available.
            var collectPrice = priceElement ? priceElement.innerText : "寶貝已失效";
            var collectObj = {
                'collectName': titleLink.title,
                'collectUrl': titleLink.href,
                'collectPrice': collectPrice
            };
            console.info(collectObj);
            collectList.push(collectObj);
        }
        console.info(url);
    }

    return '{"collectInfo":' + JSON.stringify(collectList) + '}';
}

//extractCollect();
這就是唯一登入淘寶獲取資訊的方法,不管是什麼語言java py,不管是用httpclient urlconnection還是urllib requests 想達到 本篇的目的,可能性為0。不服不信的可以用httpclient urllib試試,光是一個介面登入淘寶,網上就在懸賞5萬人民幣了,就不說提取資訊了,單是把這個介面登入淘寶解決,相當於幾個月的工資了。