網路爬蟲示例

weiqiangGG發表於2018-10-30

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal web-crawler example: downloads a page and prints the targets of the
 * {@code href="..."} attributes found in it.
 */
public class WebSpider1 {
	/** Page fetched by {@link #main(String[])} when no URL argument is given. */
	private static final String DEFAULT_URL = "http://www.163.com";

	/** Charset name used by {@link #main(String[])} when no charset argument is given. */
	private static final String DEFAULT_CHARSET = "gbk";

	/**
	 * Captures the value of a double-quoted {@code href} attribute consisting only of
	 * word characters, whitespace, {@code '.'}, {@code '/'} and {@code ':'}.
	 */
	private static final String HREF_REGEX = "href=\"([\\w\\s./:]*?)\"";

	/**
	 * Downloads the resource at {@code urlStr} and returns its text decoded with charset {@code s}.
	 *
	 * <p>The text is read line by line and the lines are concatenated <em>without</em> their
	 * line terminators. This method is best-effort: on any I/O failure (malformed URL,
	 * unsupported charset name, read error) the stack trace is printed and whatever was read
	 * up to that point is returned, which may be the empty string.
	 *
	 * @param urlStr the URL to read from
	 * @param s      the name of the charset used to decode the response bytes
	 * @return the concatenated lines that were read; never {@code null}
	 */
	public static String getContent(String urlStr, String s) {
		StringBuilder sb = new StringBuilder();
		// The raw stream is declared as its own resource so that it is closed even when
		// constructing the reader fails (e.g. because the charset name is not supported).
		try (InputStream in = new URL(urlStr).openStream();
				BufferedReader bf = new BufferedReader(new InputStreamReader(in, s))) {
			String line;
			while ((line = bf.readLine()) != null) {
				sb.append(line);
			}
		} catch (IOException e) {
			// MalformedURLException and UnsupportedEncodingException are IOExceptions too,
			// so this single clause covers every failure the original handled.
			e.printStackTrace();
		}
		return sb.toString();
	}

	/**
	 * Finds every match of {@code regexStr} in {@code des} and collects one string per match.
	 *
	 * <p>When the pattern declares at least one capturing group, the text of group 1 is
	 * collected (this is {@code null} for a match in which group 1 did not participate).
	 * When the pattern declares no capturing group, the whole match is collected instead.
	 *
	 * @param des      the text to search
	 * @param regexStr the regular expression to apply
	 * @return the collected strings in match order; empty when nothing matches
	 * @throws java.util.regex.PatternSyntaxException if {@code regexStr} is not a valid pattern
	 */
	public static List<String> getStr(String des, String regexStr) {
		Matcher m = Pattern.compile(regexStr).matcher(des);
		// group(1) throws IndexOutOfBoundsException on a pattern without groups,
		// so fall back to group 0 (the entire match) in that case.
		int group = m.groupCount() > 0 ? 1 : 0;
		List<String> list = new ArrayList<>();
		while (m.find()) {
			list.add(m.group(group));
		}
		return list;
	}

	/**
	 * Fetches a page and prints each extracted {@code href} value on its own line.
	 *
	 * @param args optional: {@code args[0]} is the URL to fetch and {@code args[1]} the charset
	 *             name; the built-in defaults are used for whichever is missing
	 */
	public static void main(String[] args) {
		int argCount = (args == null) ? 0 : args.length;
		String urlStr = argCount > 0 ? args[0] : DEFAULT_URL;
		String charset = argCount > 1 ? args[1] : DEFAULT_CHARSET;

		String des = getContent(urlStr, charset);
		for (String link : getStr(des, HREF_REGEX)) {
			System.out.println(link);
		}
	}
}

相關文章