java爬蟲入門--用jsoup爬取汽車之家的新聞

weixin_33982670發表於2018-02-22

概述

使用jsoup來進行網頁資料爬取。jsoup 是一款Java 的HTML解析器,可直接解析某個URL地址、HTML文字內容。它提供了一套非常省力的API,可通過DOM、CSS以及類似於jQuery的操作方法來取出和操作資料。

詳細

jsoup 是一款Java 的HTML解析器,可直接解析某個URL地址、HTML文字內容。它提供了一套非常省力的API,可通過DOM、CSS以及類似於jQuery的操作方法來取出和操作資料。

jsoup的主要功能如下:

1. 從一個URL,檔案或字串中解析HTML;

2. 使用DOM或CSS選擇器來查詢、取出資料;

3. 可操作HTML元素、屬性、文字;

jsoup是基於MIT協議釋出的,可放心使用於商業專案

第一步:專案預覽

blob.png

第二步:程式碼實現

主程式為GrapNews類,實現了從汽車網摘取相關內容的功能。GrapNews有main函式,執行即可。

 

package net.sinolbs.ycd.news;

import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

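/**
 * Demo crawler built on jsoup: scrapes article lists and article bodies from
 * Autohome, Jokeji, Sohu and Miaopai pages. Run {@link #main(String[])} to try it.
 * Created 2017-05-21 00:25:30.
 */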
public class GrapNews {
	
    /** Matches one character in the range U+4E00..U+9FA5; compiled once and reused. */
    private static final Pattern CHINESE_CHAR_PATTERN = Pattern.compile("[\u4e00-\u9fa5]");

    /**
     * Tells whether the given text contains at least one Chinese character.
     *
     * @param str the text to inspect; may be {@code null}
     * @return {@code true} if any character lies in U+4E00..U+9FA5; {@code false}
     *         otherwise, including for {@code null} or empty input
     */
    public static boolean isContainChinese(String str) {
        if (str == null) {
            return false;
        }
        return CHINESE_CHAR_PATTERN.matcher(str).find();
    }
    
    /**
     * Crawls a Jokeji list page and downloads the linked articles.
     *
     * <p>Every {@code <li>} inside the first element carrying the list class is
     * inspected; its first anchor supplies the title and link. The linked page is
     * fetched, images and anchors are stripped from the content element, and the
     * result is wrapped into a minimal HTML document. Items without an anchor,
     * without a title or without a content element are skipped.
     *
     * <p>NOTE(review): the date is always read from {@code <i>} elements; the value
     * of {@code dateTag} is only checked for {@code null}. Confirm whether the
     * selector was meant to be {@code dateTag}.
     *
     * @param size                 crawling stops as soon as exactly this many articles
     *                             have been collected
     * @param baseUrl              URL of the list page
     * @param domainName           prefix prepended to every {@code href} found in the list
     * @param newsListClassOrId    CSS class name of the list container
     * @param newsULIndex          currently unused
     * @param newsContentClassOrId element id of the article body on the article page
     * @param titleTagOrClass      tag name looked up inside the anchor to obtain the title
     * @param dateTag              when non-null, the item's date is stored
     * @return the collected articles; never {@code null}, possibly incomplete when an
     *         error occurred (the stack trace is printed)
     */
    public static ArrayList<News> getNewsFromJokeji(int size,String baseUrl,String domainName,
            String newsListClassOrId,int newsULIndex,
            String newsContentClassOrId,String titleTagOrClass,String dateTag){
        ArrayList<News> newsList = new ArrayList<News>();
        try {
            Document doc = Jsoup.connect(baseUrl).timeout(10000).get();
            Element listContainer = doc.getElementsByClass(newsListClassOrId).first();
            if (listContainer == null) {
                // Page layout changed or wrong class name: nothing to crawl.
                System.err.println("list container not found: " + newsListClassOrId);
                return newsList;
            }
            for (Element ele : listContainer.getElementsByTag("li")) {
                Element title = ele.select("a").first();
                if (title == null) {
                    continue;
                }
                String titleText = title.getElementsByTag(titleTagOrClass).text();
                if (titleText == null || titleText.equals("")) {
                    continue;
                }
                News news = new News();
                news.setTitle(titleText);
                news.setHref(domainName + title.attr("href"));
                if (dateTag != null) {
                    news.setDate(ele.select("i").text());
                }
                String newsUrl = news.getHref();
                if (isContainChinese(newsUrl)) {
                    // Percent-encode the URL but restore ':' and '/'. The result is NOT
                    // lower-cased: that would corrupt case-sensitive paths.
                    newsUrl = URLEncoder.encode(newsUrl, "utf-8")
                            .replace("%3A", ":").replace("%2F", "/");
                }
                Document newsDoc = Jsoup.connect(newsUrl).timeout(10000).get();
                Element contentElement = newsDoc.getElementById(newsContentClassOrId);
                if (contentElement == null) {
                    // Article page without the expected body element: skip this item only.
                    continue;
                }
                String text = deleteA(deleteImg(contentElement.html()));
                StringBuilder page = new StringBuilder();
                page.append("<!DOCTYPE html><html><head><meta name=\"content-type\" content=\"text/html; charset=UTF-8\">");
                page.append("</head><body>");
                page.append(deleteSource(text));
                page.append("</body></html>");
                news.setContent(page.toString());
                System.out.println("標題====="+news.getTitle());
                System.out.println("href====="+news.getHref());
                System.out.println("content====="+news.getContent());
                newsList.add(news);
                if (newsList.size() == size) {
                    break;
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return newsList;
    }

    /**
     * Crawls an Autohome list page and downloads the linked articles.
     *
     * <p>Every child of the list element is inspected; its first anchor supplies the
     * title and link. The article page is fetched and the HTML of the content
     * elements is taken. Articles whose HTML contains one of a fixed set of
     * "do not reprint" / "read more" phrases are skipped. Image sources are then
     * normalised via {@link #replaceImgSrcFromDataSrc(String, boolean, String)},
     * everything from the last {@code "("} onwards is cut off, and the rest is
     * wrapped into a minimal HTML document.
     *
     * <p>NOTE(review): the date is always read from {@code <i>} elements; the value
     * of {@code dateTag} is only checked for {@code null}. Confirm whether the
     * selector was meant to be {@code dateTag}.
     *
     * @param size             crawling stops as soon as exactly this many articles
     *                         have been collected
     * @param baseUrl          URL of the list page
     * @param domainName       prefix prepended to every {@code href} found in the list
     * @param newsListId       element id of the list container
     * @param newsContentClass CSS class name of the article body on the article page
     * @param titleTag         tag name looked up inside the anchor to obtain the title
     * @param dateTag          when non-null, the item's date is stored
     * @param needDeleteAlt    passed through to {@code replaceImgSrcFromDataSrc}
     * @return the collected articles; never {@code null}, possibly incomplete when an
     *         error occurred (the stack trace is printed)
     */
    public static ArrayList<News> getNewsFromCarHome(int size,String baseUrl,String domainName,String newsListId,
            String newsContentClass,String titleTag,String dateTag,String needDeleteAlt){
        ArrayList<News> newsList = new ArrayList<News>();
        try {
            Document doc = Jsoup.connect(baseUrl).timeout(10000).get();
            Element listContainer = doc.getElementById(newsListId);
            if (listContainer == null) {
                // Page layout changed or wrong id: nothing to crawl.
                System.err.println("list container not found: " + newsListId);
                return newsList;
            }
            for (Element ele : listContainer.children()) {
                Element title = ele.select("a").first();
                if (title == null) {
                    continue;
                }
                String titleText = title.getElementsByTag(titleTag).text();
                if (titleText == null || titleText.equals("")) {
                    continue;
                }
                News news = new News();
                news.setTitle(titleText);
                news.setHref(domainName + title.attr("href"));
                if (dateTag != null) {
                    news.setDate(ele.select("i").text());
                }
                String newsUrl = news.getHref();
                if (isContainChinese(newsUrl)) {
                    // Percent-encode the URL but restore ':' and '/'. The result is NOT
                    // lower-cased: that would corrupt case-sensitive paths.
                    newsUrl = URLEncoder.encode(newsUrl, "utf-8")
                            .replace("%3A", ":").replace("%2F", "/");
                }
                Document newsDoc = Jsoup.connect(newsUrl).timeout(10000).get();
                String text = newsDoc.getElementsByClass(newsContentClass).html();
                // contains() also catches a phrase at index 0, which indexOf(..) > 0 missed.
                if (text.contains("餘下全文") || text.contains("未經許可")
                        || text.contains("禁止轉載") || text.contains("公眾號") || text.contains("公眾賬號")) {
                    continue;
                }
                text = replaceImgSrcFromDataSrc(text, true, needDeleteAlt);
                // Drop everything from the last "(" onwards
                // (presumably a trailing attribution; verify against real pages).
                int index = text.lastIndexOf("(");
                if (index > 0) {
                    text = text.substring(0, index);
                }
                StringBuilder page = new StringBuilder();
                page.append("<!DOCTYPE html><html><head><meta name=\"content-type\" content=\"text/html; charset=UTF-8\">");
                page.append("</head><body>");
                page.append(deleteSource(text));
                page.append("</body></html>");
                news.setContent(page.toString());
                System.out.println("標題====="+news.getTitle());
                System.out.println("href====="+news.getHref());
                System.out.println("content====="+news.getContent());
                newsList.add(news);
                if (newsList.size() == size) {
                    break;
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return newsList;
    }
    
    
    
	/**
	 * Downloads the page at {@code baseUrl} and extracts the video URL from its HTML.
	 *
	 * @param baseUrl URL of the page to download
	 * @return whatever {@link #getUrlFromMiaoPaiHtml(String)} extracts from the page
	 * @throws Exception if the page cannot be fetched or the extraction fails
	 */
	public static String getVideoFromMiaoPai(String baseUrl) throws Exception{
		String pageHtml = Jsoup.connect(baseUrl).timeout(10000).get().html();
		return getUrlFromMiaoPaiHtml(pageHtml.trim());
	}
    
	/**
	 * Extracts the video URL that follows the {@code videoSrc} marker in the page HTML.
	 *
	 * <p>The value is taken from 11 characters after the start of {@code videoSrc}
	 * (the 8-character marker plus 3 more, presumably {@code ":"}) up to the first
	 * double quote found before the {@code poster} marker.
	 *
	 * @param html page HTML; may be {@code null}
	 * @return the extracted URL, or {@code null} when {@code html} is {@code null},
	 *         either marker is missing, or {@code poster} precedes the value
	 */
	public static String getUrlFromMiaoPaiHtml(String html){
		if (html == null) {
			return null;
		}
		int startIndex = html.indexOf("videoSrc");
		int endIndex = html.indexOf("poster");
		if (startIndex < 0 || endIndex < 0) {
			// Without both markers the offsets below would select unrelated text.
			return null;
		}
		int from = startIndex + 11;
		int to = endIndex + 5;
		if (from > to) {
			// "poster" occurs before the videoSrc value: no usable window.
			return null;
		}
		String videoUrl = html.substring(from, to);
		int index = videoUrl.indexOf('"');
		if (index > 0) {
			return videoUrl.substring(0, index);
		}
		return videoUrl;
	}
	
	/**
	 * Extracts the cover image URL that follows the {@code poster} marker in the page HTML.
	 *
	 * <p>The value starts 9 characters after the start of {@code poster} (the
	 * 6-character marker plus 3 more, presumably {@code ":"}) and ends with the
	 * first {@code jpg} found after the marker. The HTML is echoed to standard
	 * output before extraction.
	 *
	 * @param html page HTML; must not be {@code null}
	 * @return the extracted URL, or {@code null} when {@code poster} or a following
	 *         {@code jpg} is not present
	 */
	public static String getVideoPhotoFromMiaoPaiHtml(String html){
		System.out.println(html);
		int startIndex = html.indexOf("poster");
		if (startIndex < 0) {
			return null;
		}
		int index = html.indexOf("jpg", startIndex) - startIndex;
		if (index < 0) {
			// No "jpg" after the marker: there is no end position to cut at.
			return null;
		}
		return html.substring(startIndex + 9, startIndex + index + 3);
	}
	
    /**
     * Demo entry point: crawls Autohome (2 articles), Jokeji (3 articles) and
     * Sohu (20 articles) in that order. The results are only printed by the
     * crawl methods; the returned lists are discarded.
     *
     * @param args ignored
     * @throws Exception declared for convenience
     */
    public static void main(String[] args) throws Exception{
        String autohomeListUrl = "http://m.autohome.com.cn/channel";
        String autohomeDomain = "http://m.autohome.com.cn";
        getNewsFromCarHome(2, autohomeListUrl, autohomeDomain, "list", "details", "h4", "time", "汽車之家");

        String jokejiListUrl = "http://www.jokeji.cn/list.htm";
        String jokejiDomain = "http://www.jokeji.cn";
        getNewsFromJokeji(3, jokejiListUrl, jokejiDomain, "list_title", 1, "text110", "a", "i");

        String sohuListUrl = "http://m.sohu.com/c/1592/";
        getNewsFromSouHu(20, sohuListUrl, "a", null, null);
    }
	 /**
	  * Crawls a Miaopai list page and builds one {@link News} entry per video.
	  *
	  * <p>For every element with class {@code videoCont} the player element supplies
	  * the video id ({@code data-scid}) and cover image ({@code data-img}). The detail
	  * page {@code http://www.miaopai.com/show/<id>.html} is fetched, the video URL is
	  * extracted with {@link #getUrlFromMiaoPaiHtml(String)}, and a small HTML page
	  * containing a {@code <video>} element is stored as the entry's content.
	  * Items without a player element or without a video id are skipped.
	  *
	  * @param size    crawling stops as soon as exactly this many videos have been
	  *                collected (same convention as the news crawlers in this class)
	  * @param baseUrl URL of the list page
	  * @return the collected videos; never {@code null}, possibly incomplete when an
	  *         error occurred (the stack trace is printed)
	  */
	public static ArrayList<News> getVideoFromMiaopai(int size,String baseUrl){
		ArrayList<News> newsList = new ArrayList<News>();
		try {
			Document listDoc = Jsoup.connect(baseUrl).timeout(10000).get();
			for (Element ele : listDoc.getElementsByClass("videoCont")) {
				Element videoElement = ele.getElementsByClass("MIAOPAI_player").first();
				if (videoElement == null) {
					continue;
				}
				String videoId = videoElement.attr("data-scid");
				String videoPhotoUrl = videoElement.attr("data-img");
				Element about = ele.getElementsByClass("viedoAbout").first();
				String videoTitle = about == null ? "" : about.getElementsByTag("p").text();
				System.out.println("視訊id"+videoId);
				System.out.println("視訊封面url"+videoPhotoUrl);
				System.out.println("視訊標題"+videoTitle);
				if (videoId.length() == 0) {
					// attr() yields "" (not null) for a missing attribute;
					// without an id there is no detail page to fetch.
					continue;
				}
				String videoDetailUrl = "http://www.miaopai.com/show/"+videoId+".html";
				// Separate variable: the list document must not be replaced by the detail page.
				Document detailDoc = Jsoup.connect(videoDetailUrl).timeout(10000).get();
				System.out.println("視訊詳情url========"+videoDetailUrl);
				News news = new News();
				news.setTitle(videoTitle);
				news.setHref("http://m.miaopai.com/show/"+videoId);
				news.setPhotoUrl(videoPhotoUrl);
				String videoUrl = getUrlFromMiaoPaiHtml(detailDoc.html());
				if (videoUrl != null) {
					news.setContent(createVideoHtml(videoUrl, videoPhotoUrl));
					System.out.println("視訊url====="+videoUrl);
					System.out.println("視訊html======"+news.getContent());
					newsList.add(news);
					if (newsList.size() == size) {
						break;
					}
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
		return newsList;
	}

	/**
	 * Builds a standalone HTML page that shows a single, centred {@code <video>} element.
	 *
	 * @param videoUrl      value for the video's {@code src} attribute
	 * @param videoPhotoUrl value for the video's {@code poster} attribute
	 * @return the serialized HTML document
	 */
	public static String createVideoHtml(String videoUrl,String videoPhotoUrl) {
		String skeleton =
				"<!DOCTYPE html><html><head><meta name=\"content-type\" content=\"text/html; charset=UTF-8\">"
				+ "</head><body>"
				+ "<div align=\"center\">"
				+ " <video></video> </div>"
				+ "</body></html>";
		Document page = Jsoup.parse(skeleton);
		page.getElementsByTag("body").attr("style", "height:400px;");

		// Attributes are set in the same order as before so the serialized output is unchanged.
		Elements video = page.getElementsByTag("video");
		video.attr("style", "width:100%;max-height:400px;");
		video.attr("poster", videoPhotoUrl);
		video.attr("autoplay", "autoplay");
		video.attr("controls", "controls");
		video.attr("src", videoUrl);
		return page.toString();
	}
	
    /**
     * Crawls a Sohu mobile list page and downloads the linked articles.
     *
     * <p>The list is taken from the first element with class {@code headlines} inside
     * the third {@code <section>} of the page. For each child, the first anchor
     * supplies the title and link; items whose title is empty or mentions
     * "廣告" / "視訊" are skipped. The article page is fetched, the {@code <article>}
     * HTML is cut to the span between the first {@code <p class="para">} and the
     * "expand" wrapper, image sources are normalised, and the result is wrapped
     * into a minimal HTML document. Articles containing "do not reprint" style
     * phrases are skipped.
     *
     * <p>NOTE(review): {@code needDeleteAlt} is not forwarded; {@code null} is passed
     * to {@code replaceImgSrcFromDataSrc}. Confirm whether that is intended.
     *
     * @param size          crawling stops as soon as exactly this many articles
     *                      have been collected
     * @param baseUrl       URL of the list page
     * @param titleTag      tag name looked up inside the anchor to obtain the title
     * @param dateTag       selector for the item's date element; {@code null} to skip the date
     * @param needDeleteAlt currently unused
     * @return the collected articles; never {@code null}, possibly incomplete when an
     *         error occurred (the stack trace is printed)
     */
    public static ArrayList<News> getNewsFromSouHu(int size,String baseUrl,
            String titleTag,String dateTag,String needDeleteAlt){
        ArrayList<News> newsList = new ArrayList<News>();
        try {
            Document doc = Jsoup.connect(baseUrl).timeout(10000).get();
            Elements sections = doc.getElementsByTag("section");
            if (sections.size() < 3) {
                System.err.println("expected at least 3 <section> elements, found " + sections.size());
                return newsList;
            }
            Elements headlines = sections.get(2).getElementsByClass("headlines");
            if (headlines.isEmpty()) {
                System.err.println("headlines container not found");
                return newsList;
            }
            for (Element ele : headlines.get(0).children()) {
                Element title = ele.select("a").first();
                if (title == null) {
                    continue;
                }
                String titleText = title.getElementsByTag(titleTag).text();
                // contains() also rejects titles that START with the keyword,
                // which indexOf(..) > 0 let through.
                if (titleText == null || titleText.equals("")
                        || titleText.contains("廣告") || titleText.contains("視訊")) {
                    continue;
                }
                News news = new News();
                news.setTitle(titleText);
                news.setHref("https://m.sohu.com"+title.attr("href"));
                if (dateTag != null) {
                    Element dateElement = ele.select(dateTag).first();
                    if (dateElement != null) {
                        news.setDate(dateElement.text());
                    }
                }
                String newsUrl = news.getHref();
                if (isContainChinese(newsUrl)) {
                    // Percent-encode the URL but restore ':' and '/'. The result is NOT
                    // lower-cased: that would corrupt case-sensitive paths.
                    newsUrl = URLEncoder.encode(newsUrl, "utf-8")
                            .replace("%3A", ":").replace("%2F", "/");
                }
                Document newsDoc = Jsoup.connect(newsUrl).timeout(10000).get();
                String text = newsDoc.getElementsByTag("article").html();
                if (text.contains("未經許可") || text.contains("禁止轉載")
                        || text.contains("公眾號") || text.contains("公眾賬號")) {
                    continue;
                }
                int index = text.indexOf("<p class=\"para\">");
                int lastIndex = text.indexOf("<div class=\"expend-wp\"> ");
                // Clamp the start so a missing paragraph marker (index == -1) no longer
                // makes substring() throw when the end marker is present.
                int start = Math.max(index, 0);
                if (lastIndex > start) {
                    text = text.substring(start, lastIndex);
                } else if (index > 0) {
                    text = text.substring(index);
                }
                text = replaceImgSrcFromDataSrc(text, true, null);
                if (text == null || text.length() == 0) {
                    continue;
                }
                StringBuilder page = new StringBuilder();
                page.append("<!DOCTYPE html><html><head>"
                        + "<meta name=\"content-type\" content=\"text/html; charset=UTF-8\">");
                page.append("</head><body>");
                page.append(deleteSource(text));
                page.append("</body></html>");
                news.setContent(page.toString());
                System.out.println("標題====="+news.getTitle());
                System.out.println("href====="+news.getHref());
                System.out.println("content====="+news.getContent());
                newsList.add(news);
                if (newsList.size() == size) {
                    break;
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return newsList;
    }
    
    /**
     * Removes every {@code <img ...>} tag (an opening {@code <img} followed by a
     * space, up to the next {@code >}) from the given HTML text.
     *
     * @param text HTML text; must not be {@code null}
     * @return the text without the matched image tags
     */
    private static String deleteImg(String text) {
        Matcher imgTags = Pattern.compile("<img [^>]*>").matcher(text);
        return imgTags.replaceAll("");
    }
    
    /**
     * Removes every {@code <a>...</a>} element, including its content, from the
     * given HTML text.
     *
     * <p>Only real anchor tags are matched: the tag name must be followed by
     * whitespace or {@code >}, so tags such as {@code <abbr>} or {@code <article>}
     * are left alone. Anchors that span several lines are removed as well.
     *
     * @param text HTML text; must not be {@code null}
     * @return the text without anchor elements
     */
    private static String deleteA(String text) {
        // (?s) lets '.' cross line breaks; (\s[^>]*)? requires a separator after "<a".
        return text.replaceAll("(?s)<a(\\s[^>]*)?>.*?</a>", "");
    }
    
    /**
     * Removes every parenthesised {@code (...)} and bracketed {@code [...]} span,
     * delimiters included, from the given text. Matching is non-greedy and does
     * not cross line breaks.
     *
     * @param text the text to clean; must not be {@code null}
     * @return the text without the matched spans
     */
    private static String deleteSource(String text) {
        Pattern bracketedSpan = Pattern.compile("\\(.*?\\)|\\[.*?]");
        return bracketedSpan.matcher(text).replaceAll("");
    }
	/**
	 * Strips the {@code href} attribute from every anchor in the given HTML.
	 *
	 * @param content HTML to process
	 * @return the HTML of the parsed document with all {@code href} attributes
	 *         removed from {@code <a>} elements
	 */
    public static String removeHref(String content){
        Document parsed = Jsoup.parse(content);
        // Elements.removeAttr applies the removal to each matched anchor.
        parsed.select("a[href]").removeAttr("href");
        return parsed.html();
    }
    
    
	/**
	 * Rewrites the {@code src} of every {@code <img>} in the given HTML from its
	 * lazy-loading attributes.
	 *
	 * <p>For each image, in this order: a non-blank {@code data-src}, then
	 * {@code original}, then {@code original-hidden} replaces {@code src}, and the
	 * source attribute is removed. Values of {@code original} /
	 * {@code original-hidden} that are protocol-relative (start with {@code //})
	 * get an {@code http:} prefix; other values are used unchanged. If the input
	 * contains no image, it is returned untouched.
	 *
	 * @param htmlBody                    HTML content; a full document when it contains
	 *                                    {@code <html>}, otherwise treated as a body fragment
	 * @param imgUrlNeedAddProtocolPrefix when {@code true}, every remaining
	 *                                    {@code src="//...} in the output is turned into
	 *                                    {@code src="http://...}
	 * @param needDeleteAlt               alt text to strip: images whose {@code alt} equals
	 *                                    this value lose the attribute; {@code null} disables
	 * @return the rewritten HTML (whole document, or only the body's children for a fragment)
	 */
	public static String replaceImgSrcFromDataSrc(String htmlBody,
			boolean imgUrlNeedAddProtocolPrefix,String needDeleteAlt) {
		Document document = Jsoup.parseBodyFragment(htmlBody);
		Elements images = document.select("img");
		if (images.isEmpty()) {
			return htmlBody;
		}
		for (Element img : images) {
			String dataSrc = img.attr("data-src");
			if (isNotBlank(dataSrc)) {
				img.attr("src", dataSrc);
				img.removeAttr("data-src");
			}
			String originalSrc = img.attr("original");
			if (isNotBlank(originalSrc)) {
				// Only protocol-relative URLs need a scheme; prefixing an absolute URL
				// would produce "http:http://...".
				img.attr("src", originalSrc.startsWith("//") ? "http:" + originalSrc : originalSrc);
				img.removeAttr("original");
			}
			String originalHiddenSrc = img.attr("original-hidden");
			if (isNotBlank(originalHiddenSrc)) {
				img.attr("src", originalHiddenSrc.startsWith("//")
						? "http:" + originalHiddenSrc : originalHiddenSrc);
				img.removeAttr("original-hidden");
			}
			// Remove the alt attribute on the DOM: jsoup serializes attributes with
			// quotes, so a textual replace of alt=<value> never matches the output.
			if (needDeleteAlt != null && needDeleteAlt.equals(img.attr("alt"))) {
				img.removeAttr("alt");
			}
		}

		String result = htmlBody.contains("<html>")
				? document.toString()
				: document.select("body>*").toString();
		if (imgUrlNeedAddProtocolPrefix) {
			result = result.replace("src=\"//", "src=\"http://");
		}
		if (needDeleteAlt != null) {
			// Kept for callers that pass the value already wrapped in quotes.
			result = result.replace("alt="+needDeleteAlt, "");
		}
		return result;
	}
	
	
	/**
	 * Tells whether the string has at least one character left after trimming.
	 *
	 * @param str the string to test; may be {@code null}
	 * @return {@code false} for {@code null} or for strings that are empty after
	 *         {@link String#trim()}; {@code true} otherwise
	 */
	private static boolean isNotBlank(String str){
		return str != null && str.trim().length() != 0;
	}
}

還有一個載體類,用於把爬取下來的網頁內容封裝到一個類裡面。

package net.sinolbs.ycd.news;

/**
 * Plain data holder for one crawled news item (or video): title, link,
 * rendered HTML content, date text and cover image URL.
 */
public class News {

	/** Numeric identifier; {@code 0} unless set. */
	private int id;

	/** Headline text. */
	private String title;

	/** Link to the original article. */
	private String href;

	/** HTML content of the item. */
	private String content;

	/** Date text as found on the source page. */
	private String date;

	/** URL of the cover image. */
	private String photoUrl;

	/** Creates an empty item; all reference fields are {@code null}. */
	public News() {
	}

	/**
	 * Creates an item with the main fields set; date and photo URL stay {@code null}.
	 *
	 * @param title   headline text
	 * @param href    link to the original article
	 * @param content HTML content
	 * @param id      numeric identifier
	 */
	public News(String title, String href, String content, int id) {
		this.id = id;
		this.title = title;
		this.href = href;
		this.content = content;
	}

	/** @return the numeric identifier */
	public int getId() {
		return this.id;
	}

	/** @param id the numeric identifier */
	public void setId(int id) {
		this.id = id;
	}

	/** @return the headline text */
	public String getTitle() {
		return this.title;
	}

	/** @param title the headline text */
	public void setTitle(String title) {
		this.title = title;
	}

	/** @return the link to the original article */
	public String getHref() {
		return this.href;
	}

	/** @param href the link to the original article */
	public void setHref(String href) {
		this.href = href;
	}

	/** @return the HTML content */
	public String getContent() {
		return this.content;
	}

	/** @param content the HTML content */
	public void setContent(String content) {
		this.content = content;
	}

	/** @return the date text */
	public String getDate() {
		return this.date;
	}

	/** @param date the date text */
	public void setDate(String date) {
		this.date = date;
	}

	/** @return the cover image URL */
	public String getPhotoUrl() {
		return this.photoUrl;
	}

	/** @param photoUrl the cover image URL */
	public void setPhotoUrl(String photoUrl) {
		this.photoUrl = photoUrl;
	}

}

第三步:執行效果

執行GrapNews類(有main方法)。

blob.png

 

注:本文著作權歸作者,由demo大師發表,拒絕轉載,轉載需要作者授權

 

相關文章