A Simple Web Crawler in Ruby
require 'open-uri'
require 'thread'
$mutex = Mutex.new # single shared mutex guarding the link stack and result lists
# run it like this :
# ruby Crawl.rb 2 1000 http://www.pragprog.com
# regexp
$link_regexp = /href\=\"[^\"]*\"/
$email_regexp_1 = /mailto\:[^\@]*\@[^\"]*[\"]/ #mailto:xx@xxxx"
$email_regexp_2 = /[\>][^\<]*\@[^\>]*[\<]/ #>xx@xx<
$before_at = /[a-zA-Z0-9]+[_?a-zA-Z0-9]+/
$after_at = /[a-zA-Z]+[-?a-zA-Z]*\.+[a-zA-Z]+/
$email_regexp=/#{$before_at}\@#{$after_at}/ #xx@xx.xx
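# A quick illustration (hypothetical input, not taken from the crawl itself) of
# what the patterns above capture when scanned against a fragment of HTML:
#   '<a href="http://example.com/about">'.scan($link_regexp)    #=> ['href="http://example.com/about"']
#   '<a href="mailto:jane@example.com">'.scan($email_regexp_1)  #=> ['mailto:jane@example.com"']
#   '>jane@example.com<'.scan($email_regexp_2)                  #=> ['>jane@example.com<']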
#ARGV
if ARGV==nil||ARGV.length<3
  puts '-- Command --'
  puts 'ruby Crawl.rb 2 1000 http://www.pragprog.com'
  puts 'help: 2->max_depth, 1000->max_pages, http://www.pragprog.com->url'
  exit(0)
end
$url=ARGV[2]
$max_depth=ARGV[0].to_i
$max_pages=ARGV[1].to_i
$fname='emails_md'+String($max_depth)+'_mp'+String($max_pages)+'_'+$url.gsub(/[\/\:]/,'_')+'.txt'
$fname_links='links_md'+String($max_depth)+'_mp'+String($max_pages)+'_'+$url.gsub(/[\/\:]/,'_')+'.txt'
$thread_num=10
$debug=false
$links_stack=[] #fifo #[[depth1,link1],[depth2,link2],[depth3,link3],...]
$links_crawled=[] #[url1,url2,url3,...]
$emails=[] #[email1,email2,email3,...]
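# Crawl strategy: breadth-first. Depth 0 seeds $links_stack with the start
# page's links; each round the worker threads shift links of the current depth
# off the stack, harvest emails, and push the next depth's links, until
# $max_depth or $max_pages is reached.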
class Crawl
  def initialize url,depth
    @url=url
    # strip trailing slashes
    while @url[-1,1]=='/'
      @url=@url.slice(0,@url.length-1)
    end
    @depth=depth
    begin
      # open-uri lets Kernel#open fetch http urls (use URI.open on Ruby 3+)
      @html=open(@url).read
    rescue
      @html=''
    end
  end
  def get_links
    @html.scan($link_regexp) do |match|
      u=Util.format_url(match,@url)
      # queue the formatted url only if it is new to both the stack and the crawled list
      if !(u==nil)&&!$links_crawled.include?(u)&&$links_stack.rassoc(u)==nil
        $links_stack.push [@depth,u]
      end
    end
  end
  def get_emails
    @html.scan($email_regexp_1) do |match|
      match=Util.format_email(match)
      if match!=nil&&!$emails.include?(match)
        $emails.push match
        msg=match+', '+@url
        puts msg
        Util.write($fname,msg+"\r\n")
      end
    end
    @html.scan($email_regexp_2) do |match|
      match=Util.format_email(match)
      if match!=nil&&!$emails.include?(match)
        $emails.push match
        msg=match+', '+@url
        puts msg
        Util.write($fname,msg+"\r\n")
      end
    end
  end
end
class Util
  # format url
  def Util.format_url url,f_url
    # remove 'www-'
    f_url=f_url.gsub(/www\-/, '')
    # strip the leading 'href="' and the trailing '"'
    url=url[6,url.length-7]
    # exclude css & js & '#'
    if Util.exclude(url)==nil||url.include?('#')
      return nil
    end
    # full path
    if url[0,4]!='http'
      while url.index('/')==0
        url=url.slice(1,url.length-1)
      end
      url=f_url+'/'+url
    end
    return url
  end
  # format email
  def Util.format_email email
    # strip the surrounding markup; use sub for the 'mailto:' prefix, because
    # delete('mailto:') would remove every m/a/i/l/t/o/: character from the address
    email=email.delete('>').delete('<').sub('mailto:','').delete('"').strip
    if String($email_regexp.match(email))==email
      return email.downcase
    else
      return nil
    end
  end
  # write msg to file
  def Util.write fname,msg
    file=File.new(fname,'a')
    file<<msg
    file.close()
  end
  # exclude css & js...
  def Util.exclude str
    ex=['css','js','pdf','jpg']
    ex.each do |e|
      index=e.length+1
      if str.length>index && str[-index,index]=='.'+e
        return nil
      end
    end
    return str
  end
end
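# Roughly how the helpers behave (illustrative values, assuming the format_email
# fix above):
#   Util.format_url('href="/about"', 'http://example.com')    #=> 'http://example.com/about'
#   Util.format_url('href="style.css"', 'http://example.com') #=> nil (excluded extension)
#   Util.format_email('mailto:Jane@Example.com"')             #=> 'jane@example.com'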
$count=1
0.upto($max_depth) do |i|
  if $debug
    puts '~~depth->'+String(i)
  end
  if i==0
    # depth 0: crawl the start url itself to seed the stack
    c=Crawl.new($url,i+1)
    c.get_links
    c.get_emails
    $links_crawled.push $url
  end
  #breadth first
  while $links_stack.length!=0
    if $debug
      puts '~~count->'+String($count)+',stack->'+String($links_stack.length)+',crawled->'+String($links_crawled.length)+',total->'+String($links_crawled.length+$links_stack.length)
      $count=$count+1
    end
    #Thread.abort_on_exception = true
    threads = []
    # spawn at most $thread_num workers, never more than there are links left
    if $links_stack.length/$thread_num>=1
      ts=$thread_num
    else
      ts=$links_stack.length%$thread_num
    end
    ts.times {
      # no block parameter here, so 'i' below still refers to the current depth
      threads << Thread.new {
        $mutex.synchronize {
          if ($links_crawled.length+$links_stack.length)<=$max_pages&&i!=$max_depth
            link=$links_stack.shift #fifo
            if link[0]==i+1
              #read links & emails from pages in stack
              c=Crawl.new(link[1],i+2)
              c.get_links
              c.get_emails
              $links_crawled.push link[1]
            else
              break
            end
          else
            #only read emails from pages in stack
            link=$links_stack.shift
            c=Crawl.new(link[1],i+2)
            c.get_emails
            $links_crawled.push link[1]
          end
        }
      }
    }
    threads.each{|t|t.join}
  end
end
Finally, run it from a command prompt: ruby C:/Ruby/Crawl.rb 2 1000 http://www.pragprog.com
Example output:
C:\Users\wujianghua>ruby C:/Ruby_my/Crawl.rb 2 1000 http://www.pragprog.com
ses@prgprg.c, http://www.pragprog.com/frequently-asked-questions/shipping
suppr@prgprg.c, http://www.pragprog.com/privacy
che@prgprg.c, http://pragprog.com/magazines
bugfr@g.c, http://www.itworld.com/business/368665/book-review-healthy-programmer
szenpus@sshd.rg, http://books.slashdot.org/story/13/08/12/1239250/book-review-the-healthy-programmer
feedbck@sshd.rg, http://books.slashdot.org/story/13/08/12/1239250/book-review-the-healthy-programmer
secury@prgprg.c, http://pragprog.com/security
ses@prgcprgrer.c, http://pragprog.com/contact
suppr@prgcprgrer.c, http://pragprog.com/contact
prpss@prgcprgrer.c, http://pragprog.com/contact
dve@prgprg.c, http://pragprog.com/contact
ndy@prgprg.c, http://pragprog.com/contact
ndy@pednce.c, http://petdance.com
nf@nncngn.c, http://antoniocangiano.com
rck@exper.c, http://exampler.com/blog
chd@chdfwer.c, http://chadfowler.com
hwdy@eeecpues.c, http://eeecomputes.com
chrsphe@decus-nsghs.c, http://www.tddsworld.com
dddevjer@devj.c, http://devjam.com
cchng@devj.c, http://devjam.com
dvd@wrdsuppy.c, http://www.wordsupply.com
esher@esherderby.c, http://www.estherderby.com
cnc@hpncn.c, http://www.hamptoncatlin.com
hcn@vweb.c, http://www.hamptoncatlin.com
undees@g.c, http://www.ian.dees.name
jes@grysfnc.c, http://blog.grayproductions.net
nf@ngrsf.c, http://langrsoft.com