網頁主動探測工具-NIO優化

壹頁書發表於2017-03-07
接前文
http://blog.itpub.net/29254281/viewspace-1344706/
http://blog.itpub.net/29254281/viewspace-1347985/

建表語句:
CREATE SEQUENCE seq_probe_id INCREMENT BY 1 START WITH 1 NOMAXvalue NOCYCLE CACHE 2000;

create table probe( 
    id int primary key,
    host varchar(40) not null, 
    path varchar(500) not null,
    state int not null,
    taskTime int not null, 
    type varchar(10) not null,
    createtime date default sysdate not null 
) ;

使用NIO優化這個程式,進一步壓榨資源使用率,已經想了好長時間了
無奈NIO+多執行緒,網上例子都不是很靠譜.自己學得也非常頭疼,一拖就是一年多.

新的程式,採用三段過程
首先 使用一個執行緒池不斷的傳送連線請求,但是不處理接收.僅僅註冊一個SelectionKey.OP_READ的鍵
另外的一個單執行緒程式,不斷select符合條件的通道,然後分配給另外一個執行緒池,用於接收資料,解析資料.(接收和解析的過程合併了)
最後,使用一個單執行緒的程式,不斷的把結果通過批量的方式刷入資料庫.這塊也算一個優化.由單條Insert改為批量入庫.這塊至少節約了一個CPU核的處理能力.

持久化過程和解析過程 基本複用了原來的程式碼

  1. import java.io.IOException;  
  2. import java.net.InetSocketAddress;  
  3. import java.net.SocketAddress;  
  4. import java.nio.ByteBuffer;  
  5. import java.nio.channels.SelectionKey;  
  6. import java.nio.channels.Selector;  
  7. import java.nio.channels.SocketChannel;  
  8. import java.nio.charset.Charset;  
  9. import java.sql.Connection;  
  10. import java.sql.DriverManager;  
  11. import java.sql.PreparedStatement;  
  12. import java.sql.SQLException;  
  13. import java.util.ArrayList;  
  14. import java.util.HashSet;  
  15. import java.util.Iterator;  
  16. import java.util.List;  
  17. import java.util.Set;  
  18. import java.util.concurrent.BlockingQueue;  
  19. import java.util.concurrent.CopyOnWriteArrayList;  
  20. import java.util.concurrent.ExecutorService;  
  21. import java.util.concurrent.Executors;  
  22. import java.util.concurrent.LinkedBlockingQueue;  
  23. import java.util.concurrent.atomic.AtomicInteger;  
  24. import java.util.regex.Matcher;  
  25. import java.util.regex.Pattern;  
  26.   
  27. public class Probe {  
  28.     private static final int REQUESTTHREADCOUNT = 10;  
  29.     private static final BlockingQueue CONNECTLIST = new LinkedBlockingQueue();  
  30.     private static final BlockingQueue PERSISTENCELIST = new LinkedBlockingQueue();  
  31.   
  32.     private static ExecutorService REQUESTTHREADPOOL;  
  33.     private static ExecutorService RESPONSETHREADPOOL;  
  34.   
  35.     private static ExecutorService PERSISTENCETHREADPOOL;  
  36.     private static final List DOMAINLIST = new CopyOnWriteArrayList<>();  
  37.     private static Selector SELECTOR;  
  38.     static {  
  39.         REQUESTTHREADPOOL = Executors.newFixedThreadPool(REQUESTTHREADCOUNT);  
  40.         RESPONSETHREADPOOL = Executors.newFixedThreadPool(3);  
  41.         PERSISTENCETHREADPOOL = Executors.newFixedThreadPool(1);  
  42.         DOMAINLIST.add("news.163.com");  
  43.         try {  
  44.             SELECTOR = Selector.open();  
  45.         } catch (IOException e) {  
  46.             e.printStackTrace();  
  47.         }  
  48.     }  
  49.   
  50.     public static void main(String[] args) throws IOException, InterruptedException {  
  51.         long start = System.currentTimeMillis();  
  52.         CONNECTLIST.put(new Task("news.163.com"80"/index.html"));  
  53.         for (int i = 0; i < REQUESTTHREADCOUNT; i++) {  
  54.             REQUESTTHREADPOOL.submit(new RequestHandler(CONNECTLIST, SELECTOR));  
  55.         }  
  56.         RESPONSETHREADPOOL  
  57.                 .submit(new ResponseHandler(SELECTOR, CONNECTLIST, PERSISTENCELIST, DOMAINLIST, RESPONSETHREADPOOL));  
  58.         PERSISTENCETHREADPOOL.submit(new PersistenceHandler(PERSISTENCELIST));  
  59.   
  60.         while (true) {  
  61.             Thread.sleep(1000);  
  62.             long end = System.currentTimeMillis();  
  63.             float interval = ((end - start) / 1000);  
  64.             int connectTotal = ResponseHandler.GETCOUNT();  
  65.   
  66.             int persistenceTotal = PersistenceHandler.GETCOUNT();  
  67.   
  68.             int connectps = Math.round(connectTotal / interval);  
  69.             int persistenceps = Math.round(persistenceTotal / interval);  
  70.             System.out.print(  
  71.                     "\r連線總數:" + connectTotal + " \t每秒連線:" + connectps + "\t連線佇列剩餘:" + CONNECTLIST.size() + " \t持久化總數:"  
  72.                             + persistenceTotal + " \t每秒持久化:" + persistenceps + "\t持久化佇列剩餘:" + PERSISTENCELIST.size());  
  73.         }  
  74.     }  
  75. }  
  76.   
  77. class RequestHandler implements Runnable {  
  78.     BlockingQueue connectlist;  
  79.     Selector selector;  
  80.   
  81.     public RequestHandler(BlockingQueue connectlist, Selector selector) {  
  82.         this.connectlist = connectlist;  
  83.         this.selector = selector;  
  84.     }  
  85.   
  86.     @Override  
  87.     public void run() {  
  88.         while (true) {  
  89.             try {  
  90.                 Task task = (Task) connectlist.take();  
  91.                 SocketAddress addr = new InetSocketAddress(task.getHost(), 80);  
  92.                 SocketChannel socketChannel = SocketChannel.open(addr);  
  93.   
  94.                 socketChannel.configureBlocking(false);  
  95.   
  96.                 ByteBuffer byteBuffer = ByteBuffer.allocate(2400);  
  97.                 byteBuffer.put(("GET " + task.getCurrentPath() + " HTTP/1.0\r\n").getBytes("utf8"));  
  98.                 byteBuffer.put(("HOST:" + task.getHost() + "\r\n").getBytes("utf8"));  
  99.                 byteBuffer.put(("Accept:*/*\r\n").getBytes("utf8"));  
  100.                 byteBuffer.put(("\r\n").getBytes("utf8"));  
  101.                 byteBuffer.flip();  
  102.                 socketChannel.write(byteBuffer);  
  103.                 byteBuffer.clear();  
  104.   
  105.                 socketChannel.register(selector, SelectionKey.OP_READ, task);  
  106.                 selector.wakeup();  
  107.             } catch (Exception e) {  
  108.                 e.printStackTrace();  
  109.             }  
  110.         }  
  111.     }  
  112. }  
  113.   
  114. class ResponseHandler implements Runnable {  
  115.     Selector selector;  
  116.     BlockingQueue connectlist;  
  117.     BlockingQueue persistencelist;  
  118.     List domainlist;  
  119.     ExecutorService threadPool;  
  120.     Charset charset = Charset.forName("utf8");  
  121.     Charset gbkcharset = Charset.forName("gbk");  
  122.   
  123.     public static int GETCOUNT() {  
  124.         return COUNT.get();  
  125.     }  
  126.   
  127.     private static final AtomicInteger COUNT = new AtomicInteger();  
  128.   
  129.     public ResponseHandler(Selector selector, BlockingQueue connectlist, BlockingQueue persistencelist, List domainlist,  
  130.             ExecutorService threadpool) {  
  131.         this.selector = selector;  
  132.         this.connectlist = connectlist;  
  133.         this.persistencelist = persistencelist;  
  134.         this.domainlist = domainlist;  
  135.         this.threadPool = threadpool;  
  136.     }  
  137.   
  138.     @Override  
  139.     public void run() {  
  140.         while (true) {  
  141.             try {  
  142.                 int n = selector.selectNow();  
  143.                 if (n == 0)  
  144.                     continue;  
  145.                 Iterator it = selector.selectedKeys().iterator();  
  146.                 while (it.hasNext()) {  
  147.   
  148.                     SelectionKey key = (SelectionKey) it.next();  
  149.                     if (key.isReadable() && key.isValid()) {  
  150.                         key.interestOps(key.interestOps() & (~SelectionKey.OP_READ));  
  151.                         Runnable r = new Runnable() {  
  152.   
  153.                             @Override  
  154.                             public void run() {  
  155.                                 try {  
  156.                                     Task task = (Task) key.attachment();  
  157.   
  158.                                     ByteBuffer byteBuffer = ByteBuffer.allocate(2400);  
  159.                                     SocketChannel channel = (SocketChannel) key.channel();  
  160.   
  161.                                     int length;  
  162.                                     while ((length = channel.read(byteBuffer)) > 0) {  
  163.                                         byteBuffer.flip();  
  164.                                         task.appendContent(charset.decode(charset.encode(gbkcharset.decode(byteBuffer)))  
  165.                                                 .toString());  
  166.   
  167.                                         byteBuffer.compact();  
  168.                                     }  
  169.                                     if (length == -1) {  
  170.                                         channel.close();  
  171.                                           
  172.                                         COUNT.incrementAndGet();  
  173.                                         new ParseHandler(task, connectlist, persistencelist, domainlist).handler();  
  174.                                     } else {  
  175.                                         channel.register(selector, SelectionKey.OP_READ, task);  
  176.                                     }  
  177.                                     key.selector().wakeup();  
  178.                                 } catch (Exception e) {  
  179.                                     try {  
  180.                                         key.cancel();  
  181.                                         key.channel().close();  
  182.                                     } catch (IOException e1) {  
  183.                                         e1.printStackTrace();  
  184.                                     }  
  185.                                     e.printStackTrace();  
  186.                                 }  
  187.   
  188.                             }  
  189.                         };  
  190.                         threadPool.submit(r);  
  191.                     }  
  192.                     it.remove();  
  193.                 }  
  194.   
  195.             } catch (Exception e) {  
  196.                 e.printStackTrace();  
  197.             }  
  198.         }  
  199.   
  200.     }  
  201. }  
  202.   
  203. class ParseHandler {  
  204.     private static final Set SET = new HashSet();  
  205.   
  206.     private BlockingQueue connectlist;  
  207.   
  208.     private BlockingQueue persistencelist;  
  209.     List domainlist;  
  210.   
  211.     Task task;  
  212.   
  213.     private interface Filter {  
  214.         void doFilter(Task fatherTask, Task newTask, String path, Filter chain);  
  215.     }  
  216.   
  217.     private class FilterChain implements Filter {  
  218.         private List list = new ArrayList();  
  219.   
  220.         {  
  221.             addFilter(new TwoLevel());  
  222.             addFilter(new OneLevel());  
  223.             addFilter(new FullPath());  
  224.             addFilter(new Root());  
  225.             addFilter(new Default());  
  226.         }  
  227.   
  228.         private void addFilter(Filter filter) {  
  229.             list.add(filter);  
  230.         }  
  231.   
  232.         private Iterator it = list.iterator();  
  233.   
  234.         @Override  
  235.         public void doFilter(Task fatherTask, Task newTask, String path, Filter chain) {  
  236.             if (it.hasNext()) {  
  237.                 ((Filter) it.next()).doFilter(fatherTask, newTask, path, chain);  
  238.             }  
  239.         }  
  240.   
  241.     }  
  242.   
  243.     private class TwoLevel implements Filter {  
  244.   
  245.         @Override  
  246.         public void doFilter(Task fatherTask, Task newTask, String path, Filter chain) {  
  247.             if (path.startsWith("../../")) {  
  248.                 String prefix = getPrefix(fatherTask.getCurrentPath(), 3);  
  249.                 newTask.init(fatherTask.getHost(), fatherTask.getPort(), path.replace("../../", prefix));  
  250.             } else {  
  251.                 chain.doFilter(fatherTask, newTask, path, chain);  
  252.             }  
  253.   
  254.         }  
  255.     }  
  256.   
  257.     private class OneLevel implements Filter {  
  258.   
  259.         @Override  
  260.         public void doFilter(Task fatherTask, Task newTask, String path, Filter chain) {  
  261.             if (path.startsWith("../")) {  
  262.                 String prefix = getPrefix(fatherTask.getCurrentPath(), 2);  
  263.                 newTask.init(fatherTask.getHost(), fatherTask.getPort(), path.replace("../", prefix));  
  264.             } else {  
  265.                 chain.doFilter(fatherTask, newTask, path, chain);  
  266.             }  
  267.   
  268.         }  
  269.   
  270.     }  
  271.   
  272.     private class FullPath implements Filter {  
  273.   
  274.         @Override  
  275.         public void doFilter(Task fatherTask, Task newTask, String path, Filter chain) {  
  276.             if (path.startsWith("http://")) {  
  277.                 Iterator it = domainlist.iterator();  
  278.                 boolean flag = false;  
  279.                 while (it.hasNext()) {  
  280.                     String domain = (String) it.next();  
  281.                     if (path.startsWith("http://" + domain + "/")) {  
  282.                         newTask.init(domain, fatherTask.getPort(), path.replace("http://" + domain + "/""/"));  
  283.                         flag = true;  
  284.                         break;  
  285.                     }  
  286.                 }  
  287.                 if (!flag) {  
  288.                     newTask.setValid(false);  
  289.                 }  
  290.             } else {  
  291.                 chain.doFilter(fatherTask, newTask, path, chain);  
  292.             }  
  293.         }  
  294.   
  295.     }  
  296.   
  297.     private class Root implements Filter {  
  298.   
  299.         @Override  
  300.         public void doFilter(Task fatherTask, Task newTask, String path, Filter chain) {  
  301.             if (path.startsWith("/")) {  
  302.                 newTask.init(fatherTask.getHost(), fatherTask.getPort(), path);  
  303.             } else {  
  304.                 chain.doFilter(fatherTask, newTask, path, chain);  
  305.             }  
  306.         }  
  307.   
  308.     }  
  309.   
  310.     private class Default implements Filter {  
  311.   
  312.         @Override  
  313.         public void doFilter(Task fatherTask, Task newTask, String path, Filter chain) {  
  314.             if (path.contains(":")) {  
  315.                 newTask.setValid(false);  
  316.                 return;  
  317.             }  
  318.             String prefix = getPrefix(fatherTask.getCurrentPath(), 1);  
  319.             newTask.init(fatherTask.getHost(), fatherTask.getPort(), prefix + "/" + path);  
  320.         }  
  321.     }  
  322.   
  323.     public ParseHandler(Task task, BlockingQueue connectlist, BlockingQueue persistencelist, List domainlist) {  
  324.         this.connectlist = connectlist;  
  325.         this.task = task;  
  326.         this.persistencelist = persistencelist;  
  327.         this.domainlist = domainlist;  
  328.     }  
  329.   
  330.     private Pattern pattern = Pattern.compile("\"[^\"]+\\.htm[^\"]*\"");  
  331.   
  332.     protected void handler() {  
  333.         try {  
  334.             parseTaskState(task);  
  335.             if (200 == task.getState()) {  
  336.                 Matcher matcher = pattern.matcher(task.getContent());  
  337.                 while (matcher.find()) {  
  338.                     String path = matcher.group();  
  339.                     if (!path.contains(" ") && !path.contains("\t") && !path.contains("(") && !path.contains(")")) {  
  340.                         path = path.substring(1, path.length() - 1);  
  341.   
  342.                         createNewTask(task, path);  
  343.                     }  
  344.                 }  
  345.             }  
  346.             task.dropContent();  
  347.             persistencelist.put(task);  
  348.         } catch (Exception e) {  
  349.             e.printStackTrace();  
  350.         }  
  351.     }  
  352.   
  353.     private void parseTaskState(Task task) {  
  354.         if (task.getContent().startsWith("HTTP/1.1")) {  
  355.             task.setState(Integer.parseInt(task.getContent().substring(912)));  
  356.         } else {  
  357.             task.setState(Integer.parseInt(task.getContent().substring(912)));  
  358.         }  
  359.     }  
  360.   
  361.     /**  
  362.      * @param fatherTask  
  363.      * @param path  
  364.      * @throws Exception  
  365.      */  
  366.     private void createNewTask(Task fatherTask, String path) throws Exception {  
  367.         Task newTask = new Task();  
  368.         FilterChain filterchain = new FilterChain();  
  369.         filterchain.doFilter(fatherTask, newTask, path, filterchain);  
  370.         if (newTask.isValid()) {  
  371.             synchronized (SET) {  
  372.                 if (SET.contains(newTask.getHost() + newTask.getCurrentPath())) {  
  373.                     return;  
  374.                 }  
  375.                 SET.add(newTask.getHost() + newTask.getCurrentPath());  
  376.             }  
  377.             connectlist.put(newTask);  
  378.         }  
  379.     }  
  380.   
  381.     private String getPrefix(String s, int count) {  
  382.         String prefix = s;  
  383.         while (count > 0) {  
  384.             prefix = prefix.substring(0, prefix.lastIndexOf("/"));  
  385.             count--;  
  386.         }  
  387.         return "".equals(prefix) ? "/" : prefix;  
  388.     }  
  389. }  
  390.   
  /**
   * One page to probe: where it lives (host, port, path), the response text
   * collected so far, and the results recorded for persistence (HTTP state,
   * type derived from the path, start/end timestamps).
   */
  class Task {
      private String host;
      private int port;
      private String currentPath;
      private long starttime;
      private long endtime;
      private String type;
      private StringBuilder content = new StringBuilder(2400);
      private int state;
      private boolean isValid = true;

      /** Creates an empty task; call {@link #init} before using it. */
      public Task() {
      }

      /** Creates a task for the given location. */
      public Task(String host, int port, String path) {
          init(host, port, path);
      }

      /**
       * Sets the location of this task and derives its type from the path.
       *
       * @param host host name to connect to
       * @param port TCP port to connect to
       * @param path request path, optionally with query string or fragment
       */
      public void init(String host, int port, String path) {
          this.setCurrentPath(path);
          this.host = host;
          this.port = port;
      }

      public long getStarttime() {
          return starttime;
      }

      public void setStarttime(long starttime) {
          this.starttime = starttime;
      }

      public long getEndtime() {
          return endtime;
      }

      public void setEndtime(long endtime) {
          this.endtime = endtime;
      }

      public boolean isValid() {
          return isValid;
      }

      public void setValid(boolean isValid) {
          this.isValid = isValid;
      }

      public int getState() {
          return state;
      }

      public void setState(int state) {
          this.state = state;
      }

      public String getCurrentPath() {
          return currentPath;
      }

      /**
       * Stores the path and derives the type from it: the text after the last
       * "." of the last path segment, ignoring any query string or fragment
       * (whichever of "?" and "#" comes first ends the path). When the last
       * segment has no ".", the whole path without query/fragment is used.
       *
       * @param currentPath request path; must not be null
       */
      public void setCurrentPath(String currentPath) {
          this.currentPath = currentPath;
          int end = currentPath.length();
          int query = currentPath.indexOf('?');
          if (query != -1) {
              end = query;
          }
          int fragment = currentPath.indexOf('#');
          if (fragment != -1 && fragment < end) {
              end = fragment;
          }
          String pathOnly = currentPath.substring(0, end);
          int dot = pathOnly.lastIndexOf('.');
          // A dot only counts when it belongs to the last segment; dots in
          // directory names or in the query string are not an extension.
          this.type = dot > pathOnly.lastIndexOf('/') ? pathOnly.substring(dot + 1) : pathOnly;
      }

      /** @return end time minus start time */
      public long getTaskTime() {
          return getEndtime() - getStarttime();
      }

      public String getType() {
          return type;
      }

      public void setType(String type) {
          this.type = type;
      }

      public String getHost() {
          return host;
      }

      public int getPort() {
          return port;
      }

      /** @return the response text collected so far; empty after {@link #dropContent()} */
      public String getContent() {
          return content == null ? "" : content.toString();
      }

      /** Releases the collected response text so it can be garbage-collected. */
      public void dropContent() {
          this.content = null;
      }

      /** Appends a chunk of response text, starting a fresh buffer if the content was dropped. */
      public void appendContent(String content) {
          if (this.content == null) {
              this.content = new StringBuilder(2400);
          }
          this.content.append(content);
      }
  }
  501.   
  502. class PersistenceHandler implements Runnable {  
  503.     static {  
  504.         try {  
  505.             Class.forName("oracle.jdbc.OracleDriver");  
  506.         } catch (ClassNotFoundException e) {  
  507.             // TODO Auto-generated catch block  
  508.             e.printStackTrace();  
  509.         }  
  510.     }  
  511.   
  512.     public static int GETCOUNT() {  
  513.         return COUNT.get();  
  514.     }  
  515.   
  516.     private static final AtomicInteger COUNT = new AtomicInteger();  
  517.     private BlockingQueue persistencelist;  
  518.   
  519.     public PersistenceHandler(BlockingQueue persistencelist) {  
  520.         this.persistencelist = persistencelist;  
  521.         try {  
  522.             conn = DriverManager.getConnection("jdbc:oracle:thin:127.0.0.1:1521:orcl""edmond""edmond");  
  523.             ps = conn.prepareStatement(  
  524.                     "insert into probe(id,host,path,state,tasktime,type) values(seq_probe_id.nextval,?,?,?,?,?)");  
  525.         } catch (SQLException e) {  
  526.             // TODO Auto-generated catch block  
  527.             e.printStackTrace();  
  528.         }  
  529.     }  
  530.   
  531.     private Connection conn;  
  532.     private PreparedStatement ps;  
  533.   
  534.     @Override  
  535.     public void run() {  
  536.         while (true) {  
  537.             this.handler();  
  538.             COUNT.addAndGet(1);  
  539.         }  
  540.     }  
  541.   
  542.     private void handler() {  
  543.         try {  
  544.             Task task = (Task) persistencelist.take();  
  545.             ps.setString(1, task.getHost());  
  546.             ps.setString(2, task.getCurrentPath());  
  547.             ps.setInt(3, task.getState());  
  548.             ps.setLong(4, task.getTaskTime());  
  549.             ps.setString(5, task.getType());  
  550.   
  551.             ps.addBatch();  
  552.             if (GETCOUNT() % 500 == 0) {  
  553.                 ps.executeBatch();  
  554.                 conn.commit();  
  555.             }  
  556.         } catch (InterruptedException e) {  
  557.             e.printStackTrace();  
  558.         } catch (SQLException e) {  
  559.             e.printStackTrace();  
  560.         }  
  561.     }  
  562. }  

每秒可以爬170-200左右的網頁


因為這個速度受制於公司頻寬.


CPU也基本上跑滿了



這個程式還有優化的空間,主要是以下程式碼的阻塞和喚醒關係,還是沒有搞明白.
socketChannel.register(selector, SelectionKey.OP_READ, task);
int n = selector.select();
key.selector().wakeup();


來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/29254281/viewspace-2134876/,如需轉載,請註明出處,否則將追究法律責任。

相關文章