java抓取網頁的亂碼問題(通用)

悠悠隱於市發表於2011-03-17
java抓取網頁的亂碼問題(通用) 2010-12-22 13:34
// 解決抓取頁面的亂碼問題

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;

/**
 * Downloads a web page and saves it to a local file without garbling its text.
 *
 * <p>The page is decoded with the charset announced in the HTTP
 * {@code Content-Type} header; when the header carries no usable charset the
 * decoder falls back to {@code gb2312}. The file is written back in that same
 * charset so the saved copy round-trips the characters that were read.
 */
public class DownPage {
    /** Page fetched when no URL is given on the command line. */
    private static final String DEFAULT_URL = "http://www.baidu.com";

    /** File written when no output path is given on the command line. */
    private static final String DEFAULT_OUTPUT = "d:/a.html";

    /** Charset used when the response does not announce a usable one. */
    private static final String FALLBACK_CHARSET = "gb2312";

    /** Name of the Content-Type parameter that carries the charset. */
    private static final String CHARSET_PARAM = "charset=";

    /**
     * Fetches a page, echoes it to standard output and saves it to a file.
     *
     * @param args optional; {@code args[0]} is the URL to fetch and
     *             {@code args[1]} the output file path. Missing entries
     *             default to {@code http://www.baidu.com} and
     *             {@code d:/a.html}.
     * @throws IOException if the page cannot be read or the file cannot be written
     */
    public static void main(String args[]) throws IOException {
        // Address of the page to fetch and where to store it.
        String urlStr = (args != null && args.length > 0) ? args[0] : DEFAULT_URL;
        String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT;

        URL url = new URL(urlStr);
        URLConnection connection = url.openConnection();

        // The Content-Type header is where the server announces the page encoding.
        String contentType = connection.getContentType();
        System.out.println(contentType);

        Charset charset = charsetFromContentType(contentType, FALLBACK_CHARSET);
        File file = new File(outputPath);

        // Read from the connection that was already opened (url.openStream() would
        // issue a second request), and close both streams even when copying fails.
        try (BufferedReader br = new BufferedReader(
                     new InputStreamReader(connection.getInputStream(), charset));
             BufferedWriter bw = new BufferedWriter(
                     new OutputStreamWriter(new FileOutputStream(file), charset))) {
            String line;
            while ((line = br.readLine()) != null) {
                System.out.println(line);
                bw.write(line);
                // readLine() strips the line terminator; put one back so the
                // saved page is not collapsed onto a single line.
                bw.newLine();
            }
        }
    }

    /**
     * Picks the charset named by a {@code Content-Type} header value.
     *
     * <p>The value is split on {@code ;} and the first {@code charset=}
     * parameter (matched case-insensitively, optionally double-quoted) whose
     * name this JVM supports is returned.
     *
     * @param contentType  header value such as {@code text/html; charset=UTF-8};
     *                     may be {@code null}
     * @param fallbackName charset name to use when the header yields nothing usable
     * @return the charset from the header, or the fallback charset
     * @throws IllegalArgumentException if the fallback is needed and
     *         {@code fallbackName} is itself null, malformed or unsupported
     */
    static Charset charsetFromContentType(String contentType, String fallbackName) {
        if (contentType != null) {
            for (String part : contentType.split(";")) {
                String param = part.trim();
                if (!param.regionMatches(true, 0, CHARSET_PARAM, 0, CHARSET_PARAM.length())) {
                    continue;
                }
                String name = param.substring(CHARSET_PARAM.length()).trim();
                if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
                    name = name.substring(1, name.length() - 1).trim();
                }
                if (name.isEmpty()) {
                    continue;
                }
                try {
                    return Charset.forName(name);
                } catch (IllegalArgumentException ignored) {
                    // Malformed or unsupported charset name in the header:
                    // keep looking, then use the fallback below.
                }
            }
        }
        return Charset.forName(fallbackName);
    }
}
 

 

相關文章