A Pretty Trashy Whole-Site Crawler -- Java Crawler

Posted by rodertW on 2019-01-07

Jsoup

Reads seed pages from a file, crawls the entire site's data, and saves it.

If you just want something quick and dirty, this works; if you are reading it to learn from, I personally think it is a bit messy.
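The crawler reads its seed URLs from a plain-text file, one URL per line (the path is hard-coded in main() below). A hypothetical seeds file might look like this (placeholder URLs, not from the original post):

http://www.example.com/
http://news.example.org/

The code also assumes jsoup (org.jsoup:jsoup) and Apache Commons IO (commons-io:commons-io) are on the classpath.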

package cn;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.FileUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class CrawlerUtils {
	public static int count = 0;
	// seed URLs read from the seeds file
	public static List<String> list = new ArrayList<String>();
	// every URL crawled so far, for deduplication
	public static HashSet<String> hashSet = new HashSet<String>();
	// thread pool (declared but never actually used below)
	ExecutorService pool = Executors.newFixedThreadPool(5);

	/** Fetch a page's raw HTML; returns null if the request fails. */
	public static String gethtml(String url) {
		String content;
		try {
			Connection con = Jsoup.connect(url);
			con.header("Accept", "text/html, application/xhtml+xml, */*");
			con.header("Content-Type", "application/x-www-form-urlencoded");
			con.header("User-Agent",
					"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)");
			content = con.get().toString();
		} catch (IOException e) {
			e.printStackTrace();
			return null;
		}
		return content;
	}

	/**
	 * Extract every URL on the page that belongs to the main site; returns a list.
	 */
	public static List<String> geturl(String html, String url) {
		List<String> list = new ArrayList<String>();
		Pattern pattern = Pattern.compile("href=\"(.*?)\"");
		Matcher matcher = pattern.matcher(html);
		// find() walks forward through successive matches
		while (matcher.find()) {
			String urlline = matcher.group().replace("href=\"", "")
					.replace("\"", "");
			// check "https" before "http": a contains("http") test matches
			// https URLs too, which made the original https branch unreachable
			if (urlline.startsWith("https")) {
				if (url.startsWith("https")
						&& urlline.contains(url.replace("https://", ""))) {
					System.out.println(urlline);
					list.add(urlline);
				}
			} else if (urlline.startsWith("http")) {
				if (url.startsWith("http")
						&& urlline.contains(url.replace("http://", ""))) {
					System.out.println(urlline);
					list.add(urlline);
				}
			} else {
				// relative link: glue it onto the base URL
				String urlString = url.substring(0, url.length() - 1) + urlline;
				System.out.println(urlString);
				list.add(urlString);
			}
		}
		return list;
	}

	/**
	 * Save HTML by appending it to a file.
	 * 
	 * @throws IOException
	 */
	public static void saveFile(String pathname, String html, String charset)
			throws IOException {
		FileUtils.write(new File(pathname), html, charset, true);

	}

	/**
	 * Write a string to a file via a raw byte stream.
	 * 
	 * @throws IOException
	 */
	public static void writeByte(String pathname, String data, String charset)
			throws IOException {
		File file = new File(pathname);
		OutputStream outputStream = new FileOutputStream(file);
		byte[] dataBytes = data.getBytes(charset);
		outputStream.write(dataBytes);
		outputStream.close();
	}

	/** Main worker: crawl a page's same-site links and save each one. */
	public static void mainUtil(String url, String maintitle) {
		try {
			String html = gethtml(url);
			if (html == null) {
				return;
			}
			System.out.println(html);
			List<String> urlList = geturl(html, url);
			for (String string : urlList) {
				// HashSet.add returns false for URLs we have already visited
				if (hashSet.add(string)) {
					String htmlline = gethtml(string);
					if (htmlline == null) {
						continue;
					}
					try {
						String title = Jsoup.parse(htmlline)
								.getElementsByTag("title").get(0).text();
						saveFile("E://crawler4j/房地產行業/" + maintitle + "/"
								+ title + System.currentTimeMillis() + ".html",
								htmlline, "utf8");
						System.out.println("Saved file #" + count++ + ": " + string);
					} catch (Exception e) {
						e.printStackTrace();
						System.out.println("Write #" + count + " failed! URL: "
								+ string);
					}
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	public static void main(String[] args) {
		// read the seed URLs, one per line
		try {
			FileReader reader = new FileReader("E://crawler4j/房地產行業seeds.txt");
			BufferedReader br = new BufferedReader(reader);
			String line;
			while ((line = br.readLine()) != null) {
				list.add(line);
			}
			br.close();
			reader.close();
		} catch (Exception e1) {
			e1.printStackTrace();
			System.out.println("No seed page file found!");
		}

		// String url1 = "http://www.minagri.gov.rw/index.php?id=16"; // leftover test seed, unused

		for (String url : list) {
			String maintitle = "untitled" + System.currentTimeMillis();
			try {
				maintitle = Jsoup.connect(url).get().getElementsByTag("title")
						.get(0).text();
			} catch (Exception e) {
				e.printStackTrace();
				continue;
			}
			// crawl the seed page, then one more level of its same-site links
			mainUtil(url, maintitle);
			String html = gethtml(url);
			// gethtml() returns null on failure; the original html.equals(null)
			// would throw a NullPointerException instead of detecting it
			if (html == null) {
				continue;
			}
			System.out.println(html);
			List<String> urlList = geturl(html, url);
			for (int i = 0; i < urlList.size(); i++) {
				mainUtil(urlList.get(i), maintitle);
			}
		}
	}
}
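A couple of notes on cleaning this up. The regex-based link extraction in geturl() is fragile: it misses single-quoted href attributes, and the relative-URL handling just chops the last character off the base URL. Since jsoup is already a dependency, it can do the extraction and resolution itself. Below is a minimal sketch of that approach; the class and method names are my own, not from the original post:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class LinkExtractorSketch {

	/**
	 * Collect all same-site links from a page. jsoup's absUrl("href")
	 * resolves relative links against the page's base URL, replacing the
	 * hand-written regex and string surgery in geturl() above.
	 */
	public static List<String> sameSiteLinks(String pageUrl) throws IOException {
		Document doc = Jsoup.connect(pageUrl).get();
		// host part of the page we fetched, e.g. "www.example.com"
		String host = doc.location().replaceFirst("^https?://", "").split("/")[0];
		List<String> links = new ArrayList<String>();
		for (Element a : doc.select("a[href]")) {
			String abs = a.absUrl("href"); // empty string if unresolvable
			if (!abs.isEmpty() && abs.contains(host)) {
				links.add(abs);
			}
		}
		return links;
	}
}

Also, the class declares a five-thread ExecutorService (pool) that is never used; every page is fetched sequentially. If you wanted the pool to actually carry the crawl, one possible shape (a sketch under my own assumptions, not the author's design) is to submit one task per seed and switch the deduplication set to a thread-safe one, since HashSet is not safe for concurrent use:

import java.util.Arrays;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class PooledCrawlSketch {

	// ConcurrentHashMap-backed set: safe for concurrent add() calls,
	// unlike the static HashSet in CrawlerUtils
	static Set<String> seen = ConcurrentHashMap.newKeySet();

	public static void main(String[] args) throws InterruptedException {
		ExecutorService pool = Executors.newFixedThreadPool(5);
		// placeholder seeds; in CrawlerUtils these come from the seeds file
		for (final String seed : Arrays.asList(
				"http://www.example.com/", "http://www.example.org/")) {
			pool.submit(new Runnable() {
				public void run() {
					// dedupe across threads, then crawl one seed,
					// e.g. CrawlerUtils.mainUtil(seed, "untitled")
					if (seen.add(seed)) {
						System.out.println("crawling " + seed);
					}
				}
			});
		}
		pool.shutdown();                          // stop accepting new tasks
		pool.awaitTermination(1, TimeUnit.HOURS); // wait for crawls to finish
	}
}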
