Python 3.2 獲取中文網頁

fallingboats發表於2012-07-23

安裝 BeautifulSoup 4(即程式碼中匯入的 bs4 套件):http://www.crummy.com/software/BeautifulSoup/

編寫程式碼如下

import urllib.request
import sys
from bs4 import BeautifulSoup
from html.parser import HTMLParser
# Download the raw bytes of the page. The response is used as a context
# manager so the underlying connection is closed as soon as it has been read.
request = urllib.request.Request("http://www.baidu.com")
with urllib.request.urlopen(request) as response:
    the_page = response.read()

# Declare the page encoding explicitly instead of relying on auto-detection,
# and name the parser so the resulting tree does not depend on which optional
# parser libraries happen to be installed.
parser = BeautifulSoup(the_page, "html.parser", from_encoding="utf-8")
print(parser.prettify(formatter="minimal"))


結果如下:

<!DOCTYPE doctype html>
<html>
 <head>
  <meta content="text/html;charset=utf-8" http-equiv="Content-Type">
   <title>
    百度一下,你就知道
   </title>
   <style>
    html{overflow-y:auto}body{font:12px arial;text-align:center;background:#fff}body,p,form,ul,li{margin:0;padding:0;list-style:none}body,form,#fm{position:relative}td{text-align:left}img{border:0}a{color:#00c}a:active{color:#f60}#u{color:#999;padding:4px 10px 5px 0;text-align:right}#u a{margin:0 5px}#u .reg{margin:0}#m{width:680px;margin:0 auto;}#nv a,#nv b,.btn,#lk{font-size:14px}#fm{padding-left:90px;text-align:left}input{border:0;padding:0}#nv{height:19px;font-size:16px;margin:0 0 4px;text-align:left;text-indent:117px;}.s_ipt_wr{width:418px;height:30px;display:inline-block;margin-right:5px;background:url(http://s1.bdstatic.com/r/www/img/i-1.0.0.png) no-repeat -304px 0;border:1px solid #b6b6b6;border-color:#9a9a9a #cdcdcd #cdcdcd #9a9a9a;vertical-align:top}.s_ipt{width:405px;height:22px;font:16px/22px arial;margin:5px 0 0 7px;background:#fff;outline:none;-webkit-appearance:none}.s_btn{width:95px;height:32px;padding-top:2px\9;font-size:14px;background:#ddd url(http://s1.bdstatic.com/r/www/img/i-1.0.0.png);cursor:pointer}.s_btn_h{background-position:-100px 0}.s_btn_wr{width:97px;height:34px;display:inline-block;background:url(http://s1.bdstatic.com/r/www/img/i-1.0.0.png) no-repeat -202px 0;*position:relative;z-index:0;vertical-align:top}#lg img{vertical-align:top;margin-bottom:3px}#lk{margin:33px 0}#lk span{font:14px "宋體"}#lm{height:60px}#lh{margin:16px 0 5px;word-spacing:3px}.tools{position:absolute;top:-4px;*top:10px;right:-13px;}#mHolder{width:62px;position:relative;z-index:296;display:none}#mCon{height:18px;line-height:18px;position:absolute;cursor:pointer;padding:0 18px 0 0;background:url(http://s1.bdstatic.com/r/www/img/bg-1.0.0.gif) no-repeat right -134px;background-position:right -136px\9}#mCon span{color:#00c;cursor:default;display:block}#mCon .hw{text-decoration:underline;cursor:pointer}#mMenu a{width:100%;height:100%;display:block;line-height:22px;text-indent:6px;text-decoration:none;filter:none\9}#mMenu,#user ul{box-shadow:1px 1px 2px 
#ccc;-moz-box-shadow:1px 1px 2px #ccc;-webkit-box-shadow:1px 1px 2px #ccc;filter: progid:DXImageTransform.Microsoft.Shadow(Strength=2, Direction=135, Color="#cccccc")\9;}#mMenu{width:56px;border:1px solid #9b9b9b;list-style:none;position:absolute;right:7px;top:28px;display:none;background:#fff}#mMenu a:hover{background:#ebebeb}#mMenu .ln{height:1px;background:#ebebeb;overflow:hidden;font-size:1px;line-height:1px;margin-top:-1px}#cp,#cp a{color:#77c}#seth{display:none;behavior:url(#default#homepage)}#setf{display:none;}#sekj{margin-left:14px;}
   </style>
   <script type="text/javascript">
    function h(obj){obj.style.behavior='url(#default#homepage)';var a = obj.setHomePage('http://www.baidu.com/');}
   </script>
  </meta>
 </head>
 <body>
  <div id="u">
   <a href="http://www.baidu.com/gaoji/preferences.html" name="tj_setting">
    搜尋設定
   </a>
   |
   <a href="https://passport.baidu.com/v2/?login&amp;tpl=mn&amp;u=http%3A%2F%2Fwww.baidu.com%2F" id="lb" name="tj_login" onclick="return false;">
    登入
   </a>
   <a class="reg" href="https://passport.baidu.com/v2/?reg&amp;regType=1&amp;tpl=mn&amp;u=http%3A%2F%2Fwww.baidu.com%2F" name="tj_reg" target="_blank">
    註冊
   </a>
  </div>
  <div id="m">
   <p id="lg">
    <img height="129" src="http://www.baidu.com/img/baidu_sylogo1.gif" usemap="#mp" width="270">
     <map name="mp">
      <area coords="40,25,230,95" href="http://hi.baidu.com/baidu/" shape="rect" target="_blank" title="點此進入 百度的空間">
      </area>
     </map>
    </img>
   </p>
   <p id="nv">
    <a href="http://news.baidu.com">
     新 聞
    </a>
    <b>
     網 頁
    </b>
    <a href="http://tieba.baidu.com">
     貼 吧
    </a>
    <a href="http://zhidao.baidu.com">
     知 道
    </a>
    <a href="http://mp3.baidu.com">
     MP3
    </a>
    <a href="http://image.baidu.com">
     圖 片
    </a>
    <a href="http://video.baidu.com">
     視 頻
    </a>
    <a href="http://map.baidu.com">
     地 圖
    </a>
   </p>
   <div id="fm">
    <form action="/s" name="f">
     <span class="s_ipt_wr">
      <input class="s_ipt" id="kw" maxlength="100" name="wd" type="text"/>
     </span>
     <input name="rsv_bp" type="hidden" value="0">
      <input name="rsv_spt" type="hidden" value="3">
       <span class="s_btn_wr">
        <input class="s_btn" id="su" onmousedown="this.className='s_btn s_btn_h'" onmouseout="this.className='s_btn'" type="submit" value="百度一下"/>
       </span>
      </input>
     </input>
    </form>
    <span class="tools">
     <span id="mHolder">
      <div id="mCon">
       <span>
        輸入法
       </span>
      </div>
     </span>
    </span>
    <ul id="mMenu">
     <li>
      <a href="#" name="ime_hw">
       手寫
      </a>
     </li>
     <li>
      <a href="#" name="ime_py">
       拼音
      </a>
     </li>
     <li class="ln">
     </li>
     <li>
      <a href="#" name="ime_cl">
       關閉
      </a>
     </li>
    </ul>
   </div>
   <p id="lk">
    <a href="http://baike.baidu.com">
     百科
    </a>
    <a href="http://wenku.baidu.com">
     文庫
    </a>
    <a href="http://www.hao123.com">
     hao123
    </a>
    <span>
     |
     <a href="/more/">
      更多&gt;&gt;
     </a>
    </span>
   </p>
   <p id="lm">
   </p>
   <p>
    <a href="http://utility.baidu.com/traf/click.php?id=215&amp;url=http://www.baidu.com" id="seth" onclick="h(this)" onmousedown="return ns_c({'fm':'behs','tab':'homepage','pos':0})">
     把百度設為主頁
    </a>
    <a href="http://www.baidu.com/cache/sethelp/index.html" id="setf" onmousedown="return ns_c({'fm':'behs','tab':'favorites','pos':0})" target="_blank">
     把百度設為主頁
    </a>
    <span id="sekj">
     <a href="http://www.baidu.com/search/baidukuijie_mp.html" onmousedown="return ns_c({'fm':'behs','tab':'kuaijie','pos':1})" target="_blank">
      把百度新增到桌面
     </a>
    </span>
   </p>
   <p id="lh">
    <a href="http://e.baidu.com/?refer=888">
     加入百度推廣
    </a>
    |
    <a href="http://top.baidu.com">
     搜尋風雲榜
    </a>
    |
    <a href="http://home.baidu.com">
     關於百度
    </a>
    |
    <a href="http://ir.baidu.com">
     About Baidu
    </a>
   </p>
   <p id="cp">
    ©2012 Baidu
    <a href="/duty/">
     使用百度前必讀
    </a>
    <a href="http://www.miibeian.gov.cn" target="_blank">
     京ICP證030173號
    </a>
    <img src="http://www.baidu.com/cache/global/img/gs.gif"/>
   </p>
  </div>
 </body>
 <script>
  var bds={se:{},comm : {ishome : 1,sid : "",user : "",username : "",sugHost : "http://suggestion.baidu.com/su",loginAction : []}}
 </script>
 <script src="http://s1.bdstatic.com/r/www/cache/global/js/home-1.1.js" type="text/javascript">
 </script>
 <script>
  var bdUser = null;var w=window,d=document,n=navigator,k=d.f.wd,a=d.getElementById("nv").getElementsByTagName("a"),isIE=n.userAgent.indexOf("MSIE")!=-1&amp;&amp;!window.opera;for(var i=0;i&lt;a.length;i++){a[i].onclick=function(){if(k.value.length&gt;0){var o=this,h=o.href,q=encodeURIComponent(k.value);if(h.indexOf("q=")!=-1){o.href=h.replace(/q=[^&amp;\x24]*/,"q="+q)}else{this.href+="?q="+q}}}};(function(){if(/q=([^&amp;]+)/.test(location.search)){k.value=decodeURIComponent(RegExp["\x241"])}})();if(n.cookieEnabled&amp;&amp;!/sug?=0/.test(d.cookie)){bds.se.sug();};function addEV(o, e, f){if(w.attachEvent){o.attachEvent("on" + e, f);}else if(w.addEventListener){ o.addEventListener(e, f, false);}}function G(id){return d.getElementById(id);}function ns_c(q){var p = encodeURIComponent(window.document.location.href), sQ = '', sV = '', mu='', img = window["BD_PS_C" + (new Date()).getTime()] = new Image();for (v in q) {sV = q[v];sQ += v + "=" + sV + "&amp;";} mu= "&amp;mu=" + p ;img.src = "http://nsclick.baidu.com/v.gif?pid=201&amp;pj=www&amp;rsv_sid=&amp;" + sQ + "path="+p+"&amp;t="+new Date().getTime();return true;}if(/\bbdime=[12]/.test(d.cookie)){document.write('&lt;script src=http://s1.bdstatic.com/r/www/cache/ime/js/openime-1.0.0.js&gt;&lt;\/script&gt;');}(function(){var u = G("u").getElementsByTagName("a"), nv = G("nv").getElementsByTagName("a"), lk = G("lk").getElementsByTagName("a"), un = "";var tj_nv = ["news","tieba","zhidao","mp3","img","video","map"];var tj_lk = ["baike","wenku","hao123","more"];un = bds.comm.user == "" ? 
"" : bds.comm.user;function _addTJ(obj){addEV(obj, "mousedown", function(e){var e = e || window.event;var target = e.target || e.srcElement;ns_c({'fm':'behs','tab':target.name||'tj_user','un':encodeURIComponent(un)});});}for(var i = 0; i &lt; u.length; i++){_addTJ(u[i]);}for(var i = 0; i &lt; nv.length; i++){nv[i].name = 'tj_' + tj_nv[i];_addTJ(nv[i]);}for(var i = 0; i &lt; lk.length; i++){lk[i].name = 'tj_' + tj_lk[i];_addTJ(lk[i]);}})();addEV(w,"load",function(){k.focus()});w.onunload=function(){};
 </script>
 <script src="http://s1.bdstatic.com/r/www/cache/global/js/tangram-1.3.4c1.0.js" type="text/javascript">
 </script>
 <script src="http://s1.bdstatic.com/r/www/cache/user/js/u-1.3.1.js" type="text/javascript">
 </script>
</html>
<!--254e1a7269462c92-->


相關文章