java 爬取網頁內容。標題、圖片等

FH-Admin發表於2021-09-24

原文網址 : https://learnku.com/articles/61080?order_by=created_at&

package com.fh.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal web-scraping helpers: download a page, extract its title and
 * collect the URLs of the images it references.
 *
 * <p>Pages are always decoded as UTF-8. All methods are static and keep no
 * state between calls.
 *
 * <p>Author: FH Admin (fhadmin.cn)
 */
public class GetWeb {

    /**
     * Matches an {@code <img>} tag whose {@code src}/{@code src2} attribute
     * ends in a known image extension. Group 3 is the optional opening quote,
     * group 4 is the attribute value up to and including the extension.
     * Case-insensitive so that {@code <Img>} and {@code photo.JPG} match too.
     */
    private static final Pattern IMG_TAG_PATTERN = Pattern.compile(
            "<(img|IMG)\\b[^>]*\\b(src|SRC|src2|SRC2)\\b\\s*=\\s*('|\")?([^'\"\n\r\f>]+(\\.jpg|\\.bmp|\\.eps|\\.gif|\\.mif|\\.miff|\\.png|\\.tif|\\.tiff|\\.svg|\\.wmf|\\.jpe|\\.jpeg|\\.dib|\\.ico|\\.tga|\\.cut|\\.pic)\\b)[^>]*>",
            Pattern.CASE_INSENSITIVE);

    /** Matches the page title; group 2 is the text between the title tags. */
    private static final Pattern TITLE_PATTERN = Pattern.compile(
            "(<title>|<TITLE>)(.*?)(</title>|</TITLE>)",
            Pattern.CASE_INSENSITIVE);

    /**
     * Downloads the document behind {@code httpUrl} and returns it as one
     * string.
     *
     * <p>The content is read line by line and the lines are concatenated
     * without any separator, so the result contains no line terminators.
     *
     * @param httpUrl address of the page to download
     * @return the page content with all line terminators removed
     * @throws IOException if the URL is malformed or the page cannot be read
     */
    public static String getHtmlCode(String httpUrl) throws IOException {
        return readPage(httpUrl, "");
    }

    /**
     * Collects the URL of every image referenced by an {@code <img>} tag on
     * the page.
     *
     * <p>Sources that already start with {@code http://} or {@code https://}
     * are returned verbatim. Every other source (relative, root-relative or
     * protocol-relative) is resolved against {@code httpUrl} with the
     * standard {@link URL} resolution rules; no network request is made to
     * check that the image actually exists. A source that cannot be resolved
     * is returned unchanged.
     *
     * @param httpUrl address of the page to scan
     * @return the image URLs in document order, duplicates included; never
     *         {@code null}
     * @throws IOException if the URL is malformed or the page cannot be read
     */
    public static List<String> getImagePathList(String httpUrl)
            throws IOException {
        List<String> imgList = new ArrayList<>();
        // Join lines with a space so that a tag broken across lines
        // ("<img" / "src=...") does not collapse into "<imgsrc=...".
        Matcher matcher = IMG_TAG_PATTERN.matcher(readPage(httpUrl, " "));
        while (matcher.find()) {
            String quote = matcher.group(3);
            String imgsrc = matcher.group(4);
            if (quote == null || quote.trim().length() == 0) {
                // An unquoted value ends at the first whitespace; anything the
                // greedy match picked up after that belongs to other attributes.
                imgsrc = imgsrc.split("\\s+")[0];
            }
            imgList.add(toAbsoluteUrl(httpUrl, imgsrc));
        }
        return imgList;
    }

    /**
     * Returns the text of the page's {@code <title>} element.
     *
     * <p>The (misspelled) method name is kept for compatibility with existing
     * callers.
     *
     * @param httpUrl address of the page to inspect
     * @return the title text, or {@code null} if the page has no title or
     *         could not be downloaded (the failure is printed to stderr)
     */
    public static String getTilte(String httpUrl) {
        try {
            Matcher matcher = TITLE_PATTERN.matcher(getHtmlCode(httpUrl));
            if (matcher.find()) {
                return matcher.group(2);
            }
        } catch (IOException e) {
            // Best effort by contract: report the problem and signal "no title".
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Checks whether the resource behind {@code strUrl} can be opened.
     *
     * @param strUrl address of the resource to probe
     * @return {@code true} if a connection could be opened and an input
     *         stream obtained, {@code false} on any I/O error (including a
     *         malformed URL)
     */
    public static boolean isNetFileAvailable(String strUrl) {
        InputStream netFileInputStream = null;
        try {
            URLConnection urlConn = new URL(strUrl).openConnection();
            netFileInputStream = urlConn.getInputStream();
            return netFileInputStream != null;
        } catch (IOException e) {
            return false;
        } finally {
            if (netFileInputStream != null) {
                try {
                    netFileInputStream.close();
                } catch (IOException ignored) {
                    // The probe result is already known; a failed close does not change it.
                }
            }
        }
    }

    /**
     * Reads the document behind {@code httpUrl} as UTF-8 text, appending
     * {@code lineJoiner} after every line in place of the line terminator.
     */
    private static String readPage(String httpUrl, String lineJoiner)
            throws IOException {
        StringBuilder content = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new URL(httpUrl).openStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append(lineJoiner);
            }
        }
        return content.toString();
    }

    /**
     * Resolves an image source found on {@code httpUrl} to an absolute URL,
     * returning the source unchanged if it is already absolute or cannot be
     * resolved.
     */
    private static String toAbsoluteUrl(String httpUrl, String imgsrc) {
        if (imgsrc.startsWith("http://") || imgsrc.startsWith("https://")) {
            return imgsrc;
        }
        try {
            return new URL(new URL(httpUrl), imgsrc).toString();
        } catch (MalformedURLException e) {
            // e.g. "data:" URIs or other schemes java.net.URL does not know.
            return imgsrc;
        }
    }
}

本作品採用《CC 協議》，轉載必須註明作者和本文連結

node：爬蟲爬取網頁圖片
2019-02-16
爬蟲網頁
簡單的爬蟲：爬取網站內容正文與圖片
2021-09-09
爬蟲網站
python 爬蟲如何爬取動態生成的網頁內容
2024-10-31
Python爬蟲網頁
Python應用開發——爬取網頁圖片
2022-09-21
Python網頁
Java爬蟲批量爬取圖片
2021-09-24
Java爬蟲
Python 爬取網頁中JavaScript動態新增的內容（一）
2018-09-28
Python網頁JavaScript
Python 爬取網頁中JavaScript動態新增的內容（二）
2018-09-28
Python網頁JavaScript
Python爬蟲—爬取某網站圖片
2020-11-19
Python爬蟲網站
網路爬蟲---從千圖網爬取圖片到本地
2019-09-03
爬蟲
網頁設計內容網頁中關於圖片預覽的設計
2019-01-14
網頁
python爬蟲---網頁爬蟲，圖片爬蟲，文章爬蟲，Python爬蟲爬取新聞網站新聞
2019-01-04
Python爬蟲網頁網站
爬取愛套圖網上的圖片
2018-03-28
python爬蟲：使用BeautifulSoup修改網頁內容
2020-04-05
Python爬蟲網頁
Python 爬蟲進階篇-利用beautifulsoup庫爬取網頁文章內容實戰演示
2020-09-14
Python爬蟲網頁
帝國cms內容頁圖片自動獲取alt和title的方法
2020-06-10
Python 爬蟲網頁內容提取工具xpath(二)
2018-12-08
Python爬蟲網頁
Python 爬蟲網頁內容提取工具xpath(一)
2018-12-06
Python爬蟲網頁
自學python網路爬蟲，從小白快速成長，分別實現靜態網頁爬取，下載meiztu中圖片；動態網頁爬取，下載burberry官網所有當季新品圖片。
2020-02-06
Python爬蟲網頁
Python爬取網頁的所有內外鏈
2021-04-09
Python網頁
python 爬蟲之requests爬取頁面圖片的url，並將圖片下載到本地
2019-06-12
Python爬蟲
Node JS爬蟲：爬取瀑布流網頁高清圖
2018-05-17
JS爬蟲網頁
AotucCrawler 快速爬取圖片
2021-11-25
爬取網頁文章
2021-09-29
網頁
php獲取網頁內容的三種方法
2018-10-17
PHP網頁
JavaScript 獲取網頁腳本程式碼內容
2020-02-20
JavaScript網頁
[譯] 如何使用 Python 和 BeautifulSoup 爬取網站內容
2019-02-23
Python網站
爬蟲---xpath解析（爬取美女圖片）
2020-12-23
爬蟲
Chrome 獲取網頁顏色（文字、圖片）
2022-04-06
Chrome網頁
爬蟲——網頁爬取方法和網頁解析方法
2020-12-07
爬蟲網頁
帝國cms標題設定了加粗、顏色等屬性在內容頁顯示
2024-10-05
利用Python爬取攝影網站圖片，切勿商用
2018-12-18
Python網站
最簡單的網路圖片的爬取 --Python網路爬蟲與資訊獲取
2020-04-04
爬蟲
go語言實現簡單爬蟲獲取頁面圖片
2022-11-14
Go爬蟲
爬蟲 Scrapy框架爬取圖蟲圖片並下載
2018-08-27
爬蟲框架
python爬取網圖
2019-10-15
Python
Python網路爬蟲2 - 爬取新浪微博使用者圖片
2018-04-10
Python爬蟲
Python爬蟲入門【4】：美空網未登入圖片爬取
2019-07-30
Python爬蟲
爬蟲Selenium+PhantomJS爬取動態網站圖片資訊（Python）
2018-03-24
爬蟲JS網站Python

java 爬取網頁內容。 標題、圖片等

相關文章

java 爬取網頁內容。標題、圖片等