Java 抓取網頁的郵件地址

悠悠隱於市 發表於 2011-03-17
package pack.java.url;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demo that downloads a web page and prints every e-mail address found in it.
 *
 * <p>The page is read line by line and each line is scanned with a regular
 * expression; every match is written to standard output. Because scanning is
 * per line, an address that is split across two lines is not detected.
 */
public class GetEmailDemo {

	/**
	 * E-mail pattern: one or more word, dot or hyphen characters, an
	 * {@code @}, one or more word, dot or hyphen characters, a literal dot,
	 * and one or more word characters. Compiled once and reused for every
	 * line instead of being recompiled on each call to {@link #parse(String)}.
	 */
	private static final Pattern EMAIL_PATTERN =
			Pattern.compile("[\\w[.-]]+@[\\w[.-]]+\\.[\\w]+");

	/**
	 * Entry point: scans a fixed forum page, then prints any collected error
	 * text followed by a completion message.
	 *
	 * @param args command-line arguments (unused)
	 */
	public static void main(String[] args) {
		GetEmailDemo emailDemo = new GetEmailDemo();
		String error = emailDemo.getWebContent("http://www.tianya.cn/publicforum/content/no04/1/1456104.shtml");
		System.out.println(error);
		System.out.println("郵件地址查詢完成...");
	}

	/**
	 * Fetches the resource at {@code path} and passes each line to
	 * {@link #parse(String)}.
	 *
	 * <p>Failures are not propagated: the stack trace is printed and the
	 * exception's {@code toString()} text is appended to the returned string.
	 *
	 * @param path URL of the page to scan; {@code null} or empty does nothing
	 * @return the concatenated text of every exception raised while opening,
	 *         reading or closing the stream, or an empty string if none occurred
	 */
	private String getWebContent(String path){
		StringBuilder errors = new StringBuilder();
		if(path == null || "".equals(path)){
			return errors.toString();
		}

		BufferedReader bufferedReader = null;
		try {
			URL url = new URL(path);
			// No charset is given, so the JVM default charset decodes the page.
			bufferedReader = new BufferedReader(new InputStreamReader(url.openStream()));
			System.out.println("開始分析郵件地址...");
			String line;
			while ((line = bufferedReader.readLine()) != null) {
				// Look for e-mail addresses on this line.
				parse(line);
			}
		} catch (IOException e) {
			// MalformedURLException is an IOException, so one handler covers
			// both a bad URL and a failed open/read.
			e.printStackTrace();
			errors.append(e.toString());
		} finally {
			// The reader is still null when the URL was malformed or the
			// stream could not be opened; closing it unguarded would throw a
			// NullPointerException and hide the error text collected above.
			if (bufferedReader != null) {
				try {
					bufferedReader.close();
				} catch (IOException e) {
					e.printStackTrace();
					errors.append(e.toString());
				}
			}
		}
		return errors.toString();
	}

	/**
	 * Prints every e-mail address found in {@code line}, one per output line.
	 *
	 * @param line text to scan; {@code null} or empty is ignored
	 */
	private void parse(String line){
		if(line == null || line.length() == 0){
			return;
		}
		Matcher matcher = EMAIL_PATTERN.matcher(line);
		while (matcher.find()) {
			System.out.println(matcher.group());
		}
	}
}

 

相關文章