Linux Enterprise Project Practice: Web Crawler (4) - Main Program Flow

Posted by 尹成 on 2014-08-28

Once the program framework has been designed, it is time to implement it. The first step is, of course, to implement the skeleton of the main program flow; after that we gradually fill in the details of each stage and the modules each stage needs to call.

 

The main program flow is as follows:

1. Parse the command-line arguments and branch to the corresponding handler

2. Parse the configuration file

3. Load the processing modules

4. Load the seed URLs

5. Start the crawl tasks

 

The code of the main program, which implements the five steps above, is as follows:

int main(int argc, char *argv[])
{
   struct epoll_event events[10];
   int daemonized = 0;
   int ch;  /* getopt() returns int; declaring this as char can break the comparison with -1 */
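   /* Step 1: parse command-line options: -v prints version info, -d runs the crawler as a daemon, -h/-? prints usage */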
 
   while ((ch = getopt(argc, argv, "vhd")) != -1) {
        switch(ch) {
            case 'v':
                version();
                break;
            case 'd':
                daemonized = 1;
                break;
            case 'h':
            case '?':
            default:
                usage();
        }
   }
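   /* Step 2: create the global config object, parse the configuration file and raise the open-file limit */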
 
   g_conf = initconfig();
   loadconfig(g_conf);
 
   set_nofile(1024);
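   /* Step 3: load every processing module listed in the configuration */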
 
   vector<char *>::iterator it = g_conf->modules.begin();
   for(; it != g_conf->modules.end(); it++) {
        dso_load(g_conf->module_path, *it);
   }
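   /* Step 4: split the comma-separated seed list, normalize each URL and push it onto the Surl queue */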
 
   if (g_conf->seeds == NULL) {
        SPIDER_LOG(SPIDER_LEVEL_ERROR, "We have no seeds, Buddy!");
   } else {
        int c = 0;
        char **splits = strsplit(g_conf->seeds, ',', &c, 0);
        while (c--) {
            Surl *surl = (Surl *)malloc(sizeof(Surl));
            surl->url = url_normalized(strdup(splits[c]));
            surl->level = 0;
            surl->type = TYPE_HTML;
            if (surl->url != NULL)
                push_surlqueue(surl);
        }
   }
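   /* Step 5: start crawling: daemonize if requested, switch to the download directory and spawn the urlparser thread */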
 
   if (daemonized)
        daemonize();
 
   chdir("download");
 
   int err = -1;
   if ((err = create_thread(urlparser, NULL, NULL, NULL)) < 0) {
        SPIDER_LOG(SPIDER_LEVEL_ERROR, "Create urlparser thread fail: %s", strerror(err));
   }
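   /* Wait with exponential backoff for the urlparser thread to produce the first crawlable URLs */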
 
   int try_num = 1;
   while(try_num < 8 && is_ourlqueue_empty())
        usleep((10000 << try_num++));
 
   if (try_num >= 8) {
        SPIDER_LOG(SPIDER_LEVEL_ERROR, "No ourl! DNS parse error?");
   }
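   /* If a statistics interval is configured, report stats periodically via SIGALRM */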
 
   if (g_conf->stat_interval > 0) {
        signal(SIGALRM, stat);
        set_ticker(g_conf->stat_interval);
   }
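   /* Create the epoll instance and attach up to max_job_num fetch tasks */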
 
   int ourl_num = 0;
   g_epfd = epoll_create(g_conf->max_job_num);
 
   while(ourl_num++ < g_conf->max_job_num) {
        if (attach_epoll_task() < 0)
            break;
   }
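   /* Event loop: wait for readable sockets and hand each one to a recv_response worker thread;
    * exit once no worker threads remain and both URL queues are empty */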
 
   int n, i;
   while(1) {
         n = epoll_wait(g_epfd, events, 10, 2000);
         printf("epoll:%d\n", n);
         if (n == -1)
             printf("epoll errno:%s\n", strerror(errno));
        fflush(stdout);
 
        if (n <= 0) {
             if (g_cur_thread_num <= 0 && is_ourlqueue_empty() && is_surlqueue_empty()) {
                 sleep(1);
                 if (g_cur_thread_num <= 0 && is_ourlqueue_empty() && is_surlqueue_empty())
                    break;
            }
        }
 
        for (i = 0; i < n; i++) {
             evso_arg *arg = (evso_arg *)(events[i].data.ptr);
             if ((events[i].events & EPOLLERR) ||
                 (events[i].events & EPOLLHUP) ||
                 (!(events[i].events & EPOLLIN))) {
                 SPIDER_LOG(SPIDER_LEVEL_WARN, "epoll fail, close socket %d", arg->fd);
                 close(arg->fd);
                 continue;
             }
             epoll_ctl(g_epfd, EPOLL_CTL_DEL, arg->fd, &events[i]); /* del event */
 
             printf("hello epoll: event=%d\n", events[i].events);
             fflush(stdout);
             create_thread(recv_response, arg, NULL, NULL);
        }
   }
 
   SPIDER_LOG(SPIDER_LEVEL_DEBUG, "Task done!");
   close(g_epfd);
   return 0;
}
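A note on building this listing: it is only a fragment of the project's main source file, so helpers such as initconfig, loadconfig, dso_load, strsplit, url_normalized, push_surlqueue, create_thread, attach_epoll_task, recv_response and the SPIDER_LOG macro are defined in other modules of the crawler. The sketch below shows roughly which headers the fragment depends on; the project header name spider.h is an assumption for illustration, not necessarily the project's actual file name.

/* Minimal header sketch for the fragment above; "spider.h" is an assumed
 * project header that would declare g_conf, g_epfd, Surl, evso_arg,
 * SPIDER_LOG and the other crawler helpers. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <signal.h>
#include <unistd.h>
#include <sys/epoll.h>
#include <vector>       /* g_conf->modules is used as a std::vector, so the file is compiled as C++ */
#include "spider.h"

using std::vector;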

