A Pretty Trashy Whole-Site Crawler -- Java Crawler

Posted by rodertW on 2019-01-07

Jsoup

Reads seed pages from a file, crawls the entire site's data, and saves it.

If you just want something quick and dirty, this works; if you are reading it to learn from, I personally think it is a bit messy.
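The crawler reads its seed URLs from a plain-text file, one URL per line (the path is hard-coded in main() below). A hypothetical seeds file might look like this (placeholder URLs, not from the original post):

http://www.example.com/
http://news.example.org/

The code also assumes jsoup (org.jsoup:jsoup) and Apache Commons IO (commons-io:commons-io) are on the classpath.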

package cn;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.FileUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class CrawlerUtils {
	public static int count = 0;
	// seed URLs read from the seeds file
	public static List<String> list = new ArrayList<String>();
	// every URL crawled so far, for deduplication
	public static HashSet<String> hashSet = new HashSet<String>();
	// thread pool (declared but never actually used below)
	ExecutorService pool = Executors.newFixedThreadPool(5);

	/** Fetch a page's raw HTML; returns null if the request fails. */
	public static String gethtml(String url) {
		String content;
		try {
			Connection con = Jsoup.connect(url);
			con.header("Accept", "text/html, application/xhtml+xml, */*");
			con.header("Content-Type", "application/x-www-form-urlencoded");
			con.header("User-Agent",
					"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)");
			content = con.get().toString();
		} catch (IOException e) {
			e.printStackTrace();
			return null;
		}
		return content;
	}

	/**
	 * Extract every URL on the page that belongs to the main site; returns a list.
	 */
	public static List<String> geturl(String html, String url) {
		List<String> list = new ArrayList<String>();
		Pattern pattern = Pattern.compile("href=\"(.*?)\"");
		Matcher matcher = pattern.matcher(html);
		// find() walks forward through successive matches
		while (matcher.find()) {
			String urlline = matcher.group().replace("href=\"", "")
					.replace("\"", "");
			// check "https" before "http": a contains("http") test matches
			// https URLs too, which made the original https branch unreachable
			if (urlline.startsWith("https")) {
				if (url.startsWith("https")
						&& urlline.contains(url.replace("https://", ""))) {
					System.out.println(urlline);
					list.add(urlline);
				}
			} else if (urlline.startsWith("http")) {
				if (url.startsWith("http")
						&& urlline.contains(url.replace("http://", ""))) {
					System.out.println(urlline);
					list.add(urlline);
				}
			} else {
				// relative link: glue it onto the base URL
				String urlString = url.substring(0, url.length() - 1) + urlline;
				System.out.println(urlString);
				list.add(urlString);
			}
		}
		return list;
	}

	/**
	 * Save HTML by appending it to a file.
	 * 
	 * @throws IOException
	 */
	public static void saveFile(String pathname, String html, String charset)
			throws IOException {
		FileUtils.write(new File(pathname), html, charset, true);

	}

	/**
	 * Write a string to a file via a raw byte stream.
	 * 
	 * @throws IOException
	 */
	public static void writeByte(String pathname, String data, String charset)
			throws IOException {
		File file = new File(pathname);
		OutputStream outputStream = new FileOutputStream(file);
		byte[] dataBytes = data.getBytes(charset);
		outputStream.write(dataBytes);
		outputStream.close();
	}

	/** Main worker: crawl a page's same-site links and save each one. */
	public static void mainUtil(String url, String maintitle) {
		try {
			String html = gethtml(url);
			if (html == null) {
				return;
			}
			System.out.println(html);
			List<String> urlList = geturl(html, url);
			for (String string : urlList) {
				// HashSet.add returns false for URLs we have already visited
				if (hashSet.add(string)) {
					String htmlline = gethtml(string);
					if (htmlline == null) {
						continue;
					}
					try {
						String title = Jsoup.parse(htmlline)
								.getElementsByTag("title").get(0).text();
						saveFile("E://crawler4j/房地產行業/" + maintitle + "/"
								+ title + System.currentTimeMillis() + ".html",
								htmlline, "utf8");
						System.out.println("Saved file #" + count++ + ": " + string);
					} catch (Exception e) {
						e.printStackTrace();
						System.out.println("Write #" + count + " failed! URL: "
								+ string);
					}
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	public static void main(String[] args) {
		// read the seed URLs, one per line
		try {
			FileReader reader = new FileReader("E://crawler4j/房地產行業seeds.txt");
			BufferedReader br = new BufferedReader(reader);
			String line;
			while ((line = br.readLine()) != null) {
				list.add(line);
			}
			br.close();
			reader.close();
		} catch (Exception e1) {
			e1.printStackTrace();
			System.out.println("No seed page file found!");
		}

		// String url1 = "http://www.minagri.gov.rw/index.php?id=16"; // leftover test seed, unused

		for (String url : list) {
			String maintitle = "untitled" + System.currentTimeMillis();
			try {
				maintitle = Jsoup.connect(url).get().getElementsByTag("title")
						.get(0).text();
			} catch (Exception e) {
				e.printStackTrace();
				continue;
			}
			// crawl the seed page, then one more level of its same-site links
			mainUtil(url, maintitle);
			String html = gethtml(url);
			// gethtml() returns null on failure; the original html.equals(null)
			// would throw a NullPointerException instead of detecting it
			if (html == null) {
				continue;
			}
			System.out.println(html);
			List<String> urlList = geturl(html, url);
			for (int i = 0; i < urlList.size(); i++) {
				mainUtil(urlList.get(i), maintitle);
			}
		}
	}
}
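A couple of notes on cleaning this up. The regex-based link extraction in geturl() is fragile: it misses single-quoted href attributes, and the relative-URL handling just chops the last character off the base URL. Since jsoup is already a dependency, it can do the extraction and resolution itself. Below is a minimal sketch of that approach; the class and method names are my own, not from the original post:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class LinkExtractorSketch {

	/**
	 * Collect all same-site links from a page. jsoup's absUrl("href")
	 * resolves relative links against the page's base URL, replacing the
	 * hand-written regex and string surgery in geturl() above.
	 */
	public static List<String> sameSiteLinks(String pageUrl) throws IOException {
		Document doc = Jsoup.connect(pageUrl).get();
		// host part of the page we fetched, e.g. "www.example.com"
		String host = doc.location().replaceFirst("^https?://", "").split("/")[0];
		List<String> links = new ArrayList<String>();
		for (Element a : doc.select("a[href]")) {
			String abs = a.absUrl("href"); // empty string if unresolvable
			if (!abs.isEmpty() && abs.contains(host)) {
				links.add(abs);
			}
		}
		return links;
	}
}

Also, the class declares a five-thread ExecutorService (pool) that is never used; every page is fetched sequentially. If you wanted the pool to actually carry the crawl, one possible shape (a sketch under my own assumptions, not the author's design) is to submit one task per seed and switch the deduplication set to a thread-safe one, since HashSet is not safe for concurrent use:

import java.util.Arrays;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class PooledCrawlSketch {

	// ConcurrentHashMap-backed set: safe for concurrent add() calls,
	// unlike the static HashSet in CrawlerUtils
	static Set<String> seen = ConcurrentHashMap.newKeySet();

	public static void main(String[] args) throws InterruptedException {
		ExecutorService pool = Executors.newFixedThreadPool(5);
		// placeholder seeds; in CrawlerUtils these come from the seeds file
		for (final String seed : Arrays.asList(
				"http://www.example.com/", "http://www.example.org/")) {
			pool.submit(new Runnable() {
				public void run() {
					// dedupe across threads, then crawl one seed,
					// e.g. CrawlerUtils.mainUtil(seed, "untitled")
					if (seen.add(seed)) {
						System.out.println("crawling " + seed);
					}
				}
			});
		}
		pool.shutdown();                          // stop accepting new tasks
		pool.awaitTermination(1, TimeUnit.HOURS); // wait for crawls to finish
	}
}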
