Implementing a Hadoop client with pycurl
I am currently testing a Hadoop feature and need to interact with Hadoop frequently. At first I used Python's subprocess module to call the command-line tools that ship with Hadoop (a minimal sketch of that approach follows the command listing below).
1. Hadoop's command-line format:
hadoop fs [cmd] — the specific commands are:
hadoop fs [-fs <local | file system URI>] [-conf <configuration file>]
[-D <property=value>] [-ls <path>] [-lsr <path>] [-du <path>]
[-dus <path>] [-mv <src> <dst>] [-cp <src> <dst>] [-rm [-skipTrash] <src>]
[-rmr [-skipTrash] <src>] [-put <localsrc> … <dst>] [-copyFromLocal <localsrc> … <dst>]
[-moveFromLocal <localsrc> … <dst>] [-get [-ignoreCrc] [-crc] <src> <localdst>]
[-getmerge <src> <localdst> [addnl]] [-cat <src>]
[-copyToLocal [-ignoreCrc] [-crc] <src> <localdst>] [-moveToLocal <src> <localdst>]
[-mkdir <path>] [-report] [-setrep [-R] [-w] <rep> <path/file>]
[-touchz <path>] [-test -[ezd] <path>] [-stat [format] <path>]
[-tail [-f] <path>] [-text <path>]
[-chmod [-R] <MODE[,MODE]… | OCTALMODE> PATH…]
[-chown [-R] [OWNER][:[GROUP]] PATH…]
[-chgrp [-R] GROUP PATH…]
[-count[-q] <path>]
[-help [cmd]]
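
Driving these commands from Python with the subprocess module is straightforward; here is a minimal sketch of the approach mentioned above (it assumes the `hadoop` binary is on PATH, and the listed path is just a placeholder):

```python
# Minimal sketch of the subprocess approach; assumes `hadoop` is on PATH.
import subprocess

def hadoop_fs(*args):
    """Run `hadoop fs <args>` and return (exit_code, stdout, stderr)."""
    cmd = ["hadoop", "fs"] + list(args)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = p.communicate()
    return p.returncode, out, err

# Example: list an HDFS directory (placeholder path).
rc, out, err = hadoop_fs("-ls", "/")
print out
```

Each invocation starts a new JVM, which is largely why the timings shown at the end of this post are so slow.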
b. Renaming a directory
# curl -i -X PUT "http://192.168.0.112:50071/webhdfs/v1/test?op=RENAME&destination=/test1"
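
The same request can be issued from Python with pycurl. A rough sketch, using the host, port, and paths from the curl example above purely for illustration:

```python
# Sketch: the WebHDFS RENAME call above, issued with pycurl (Python 2 style).
import StringIO
import pycurl

buf = StringIO.StringIO()
c = pycurl.Curl()
c.setopt(pycurl.URL,
         "http://192.168.0.112:50071/webhdfs/v1/test?op=RENAME&destination=/test1")
c.setopt(pycurl.CUSTOMREQUEST, "PUT")      # RENAME is an HTTP PUT in WebHDFS
c.setopt(pycurl.WRITEFUNCTION, buf.write)  # collect the JSON response
c.perform()
print buf.getvalue()                       # expected: {"boolean":true}
c.close()
```

The WebHadoop class below wraps this pattern for the common WebHDFS operations (list, mkdir, delete, rename, upload, download).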
```python
#!/usr/bin/env python
# -*- encoding:utf-8 -*-
"""A library to access the Hadoop HTTP REST API (WebHDFS).

Make sure your Hadoop cluster has HTTP access enabled.

author : liran
date   : 2013-03-11
thanks : xwu
武漢雲雅科技有限公司
"""
import StringIO
import json
import pycurl
import re
import sys
import logging
import os


class WebHadoop(object):

    def __init__(self, host, port, username, logger, prefix="/webhdfs/v1"):
        self.host = host
        self.port = port
        self.user = username
        self.logger = logger
        self.prefix = prefix
        self.status = None
        self.url = "http://%s:%s" % (host, port)
        self.url_path = self.url + self.prefix
    def checklink(self):
        """Check the NameNode web UI and return the number of live datanodes."""
        try:
            b = StringIO.StringIO()
            c = pycurl.Curl()
            checkurl = self.url + "/dfsnodelist.jsp?whatNodes=LIVE"
            c.setopt(pycurl.URL, checkurl)
            c.setopt(pycurl.HTTPHEADER, ["Accept:"])
            c.setopt(pycurl.WRITEFUNCTION, b.write)
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            c.perform()
            self.status = c.getinfo(c.HTTP_CODE)
            body = b.getvalue()
            self.Write_Debug_Log(self.status, checkurl)
            p = re.compile(r"Live Datanodes :(.*)</a")
            results = p.findall(body)
            if results[0] == "0":
                self.logger.error("Sorry, there are no live datanodes in the Hadoop cluster!")
                sys.exit(255)
            return results[0]
        except pycurl.error, e:
            self.logger.error("Sorry, cannot reach the Hadoop HTTP interface. Error: %s" % e)
            sys.exit(255)
        finally:
            c.close()
            b.close()
    def lsdir(self, path):
        """List the status of every entry under an HDFS directory."""
        try:
            b = StringIO.StringIO()
            c = pycurl.Curl()
            lsdir_url = self.url_path + path + "?op=LISTSTATUS"
            c.setopt(pycurl.URL, lsdir_url)
            c.setopt(pycurl.HTTPHEADER, ["Accept:"])
            c.setopt(pycurl.WRITEFUNCTION, b.write)
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            c.perform()
            body = b.getvalue()
            self.status = c.getinfo(c.HTTP_CODE)
        except Exception, e:
            print e
        finally:
            c.close()
            b.close()
        if self.status == 200:
            data = json.loads(body)
            return data["FileStatuses"]["FileStatus"]
        else:
            self.logger.error("Sorry, cannot list the dir or file status!")
            self.Write_Debug_Log(self.status, lsdir_url)
            return False
    def lsfile(self, path):
        """Return the FileStatus of a single HDFS file (fails if it is a directory)."""
        try:
            c = pycurl.Curl()
            b = StringIO.StringIO()
            lsfile_url = self.url_path + path + "?op=GETFILESTATUS"
            c.setopt(pycurl.URL, lsfile_url)
            c.setopt(pycurl.HTTPHEADER, ["Accept:"])
            c.setopt(pycurl.WRITEFUNCTION, b.write)
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            c.perform()
            body = b.getvalue()
            self.status = c.getinfo(c.HTTP_CODE)
        except Exception, e:
            print e
        finally:
            c.close()
            b.close()
        if self.status == 200:
            data = json.loads(body)
            if data["FileStatus"]["type"] == "DIRECTORY":
                self.logger.error("Sorry, %s is actually a directory!" % path)
                return False
            else:
                return data["FileStatus"]
        else:
            self.logger.error("Sorry, cannot get the file status!")
            self.Write_Debug_Log(self.status, lsfile_url)
            return False
    def mkdir(self, path, permission="755"):
        """Create an HDFS directory with the MKDIRS operation."""
        try:
            b = StringIO.StringIO()
            c = pycurl.Curl()
            mkdir_str = '{"op":"MKDIRS","permission":"%s"}' % permission
            mkdir_url = "%s%s?op=MKDIRS&permission=%s" % (self.url_path, path, permission)
            c.setopt(pycurl.URL, mkdir_url)
            c.setopt(pycurl.HTTPHEADER, ["Content-Type: application/json",
                                         "Content-Length: " + str(len(mkdir_str))])
            c.setopt(pycurl.CUSTOMREQUEST, "PUT")
            c.setopt(pycurl.POSTFIELDS, mkdir_str)
            c.setopt(pycurl.WRITEFUNCTION, b.write)
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            c.perform()
            self.status = c.getinfo(c.HTTP_CODE)
            body = b.getvalue()
        except Exception, e:
            print e
        finally:
            c.close()
            b.close()
        if self.status == 200:
            if "true" in body:
                self.logger.info("Great, successfully created dir %s in the Hadoop cluster!" % path)
                return True
            elif "false" in body:
                self.logger.info("Sorry, cannot create dir %s in the Hadoop cluster!" % path)
                return False
            else:
                return False
        else:
            self.logger.error("Sorry, cannot create dir %s in the Hadoop cluster!" % path)
            self.Write_Debug_Log(self.status, mkdir_url)
            return False
    def remove(self, path, recursive="true"):
        """Delete an HDFS file or directory with the DELETE operation."""
        try:
            c = pycurl.Curl()
            b = StringIO.StringIO()
            remove_str = '{"op":"DELETE","recursive":"%s"}' % recursive
            remove_url = "%s%s?op=DELETE&recursive=%s" % (self.url_path, path, recursive)
            c.setopt(pycurl.URL, remove_url)
            c.setopt(pycurl.HTTPHEADER, ["Content-Type: application/json",
                                         "Content-Length: " + str(len(remove_str))])
            c.setopt(pycurl.CUSTOMREQUEST, "DELETE")
            c.setopt(pycurl.POSTFIELDS, remove_str)
            c.setopt(pycurl.WRITEFUNCTION, b.write)
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            c.perform()
            body = b.getvalue()
            self.status = c.getinfo(c.HTTP_CODE)
        except Exception, e:
            print e
        finally:
            c.close()
            b.close()
        if self.status == 200:
            if "true" in body:
                self.logger.info("Great, successfully deleted dir or file %s in the Hadoop cluster!" % path)
                return True
            elif "false" in body:
                self.logger.info("Sorry, cannot delete the dir or file; maybe it does not exist!")
                return False
            else:
                return False
        else:
            self.logger.error("Sorry, cannot delete %s in the Hadoop cluster!" % path)
            self.Write_Debug_Log(self.status, remove_url)
            return False
    def rename(self, src, dst):
        """Rename (move) an HDFS file or directory with the RENAME operation."""
        try:
            c = pycurl.Curl()
            b = StringIO.StringIO()
            rename_str = '{"op":"RENAME"}'
            rename_url = "%s%s?op=RENAME&destination=%s" % (self.url_path, src, dst)
            c.setopt(pycurl.URL, rename_url)
            c.setopt(pycurl.HTTPHEADER, ["Content-Type: application/json",
                                         "Content-Length: " + str(len(rename_str))])
            c.setopt(pycurl.CUSTOMREQUEST, "PUT")
            c.setopt(pycurl.POSTFIELDS, rename_str)
            c.setopt(pycurl.WRITEFUNCTION, b.write)
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            c.perform()
            body = b.getvalue()
            self.status = c.getinfo(c.HTTP_CODE)
        except Exception, e:
            print e
        finally:
            c.close()
            b.close()
        if self.status == 200:
            if "true" in body:
                self.logger.info("Great, successfully renamed %s in the Hadoop cluster!" % rename_url)
                return True
            elif "false" in body:
                self.logger.info("Sorry, cannot rename the dir or file; maybe it does not exist!")
                return False
            else:
                return False
        else:
            self.logger.error("Sorry, the rename request %s failed!" % rename_url)
            self.Write_Debug_Log(self.status, rename_url)
            return False
    def put_file(self, local_path, hdfs_path, overwrite="true", permission="755", buffersize="128"):
        """Upload a local file to HDFS (CREATE): first ask the NameNode for the
        DataNode redirect URL, then PUT the file content to that URL."""
        try:
            c = pycurl.Curl()
            b = StringIO.StringIO()
            header_str = StringIO.StringIO()
            put_str = '{"op":"CREATE","overwrite":"%s","permission":"%s","buffersize":"%s"}' % \
                      (overwrite, permission, buffersize)
            put_url = "%s%s?op=CREATE&overwrite=%s&permission=%s&buffersize=%s" % \
                      (self.url_path, hdfs_path, overwrite, permission, buffersize)
            c.setopt(pycurl.URL, put_url)
            c.setopt(pycurl.HTTPHEADER, ["Content-Type: application/json",
                                         "Content-Length: " + str(len(put_str))])
            c.setopt(pycurl.CUSTOMREQUEST, "PUT")
            c.setopt(pycurl.HEADER, 1)
            c.setopt(pycurl.HEADERFUNCTION, header_str.write)
            c.setopt(pycurl.POSTFIELDS, put_str)
            c.setopt(pycurl.WRITEFUNCTION, b.write)
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            c.perform()
            # After following the 307 redirect, this is the DataNode URL to write to.
            redirect_url = c.getinfo(pycurl.EFFECTIVE_URL)
        except Exception, e:
            print e
        if os.path.isfile(local_path):
            try:
                f = open(local_path, "rb")
                filesize = os.path.getsize(local_path)
                c.setopt(pycurl.URL, redirect_url)
                c.setopt(pycurl.HEADER, 1)
                c.setopt(pycurl.CUSTOMREQUEST, "PUT")
                c.setopt(pycurl.PUT, 1)
                c.setopt(pycurl.INFILE, f)
                c.setopt(pycurl.INFILESIZE, filesize)
                c.setopt(pycurl.WRITEFUNCTION, b.write)
                c.setopt(pycurl.FOLLOWLOCATION, 1)
                c.setopt(pycurl.MAXREDIRS, 5)
                c.perform()
                self.status = c.getinfo(c.HTTP_CODE)
            except Exception, e:
                print e
            finally:
                c.close()
                b.close()
                header_str.close()
                f.close()
        else:
            self.logger.error("Sorry, %s does not exist or is not a file." % local_path)
            return False
        if self.status != 201:
            self.Write_Debug_Log(self.status, put_url)
            return False
        else:
            self.logger.info("Great, successfully put file into hdfs %s" % hdfs_path)
            return True
    def append(self, local_path, hdfs_path, buffersize=None):
        """APPEND is not implemented yet."""
        pass

    def get_file(self, local_path, hdfs_path, buffersize="128"):
        """Download an HDFS file to a local path with the OPEN operation."""
        c = pycurl.Curl()
        f = open(local_path, "wb")
        get_url = "%s%s?op=OPEN&buffersize=%s" % (self.url_path, hdfs_path, buffersize)
        try:
            c.setopt(pycurl.URL, get_url)
            c.setopt(pycurl.WRITEFUNCTION, f.write)
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            c.setopt(pycurl.CONNECTTIMEOUT, 60)
            c.setopt(pycurl.TIMEOUT, 300)
            c.perform()
            self.status = c.getinfo(pycurl.HTTP_CODE)
        except Exception, e:
            print e
        finally:
            c.close()
            f.close()
        if self.status != 200:
            self.Write_Debug_Log(self.status, get_url)
            return False
        else:
            self.logger.info("Great, successfully got file %s from hdfs" % hdfs_path)
            return True
    def cat_file(self, hdfs_path, buffersize="128"):
        """Print the content of an HDFS file with the OPEN operation."""
        c = pycurl.Curl()
        b = StringIO.StringIO()
        open_url = "%s%s?op=OPEN&buffersize=%s" % (self.url_path, hdfs_path, buffersize)
        try:
            c.setopt(pycurl.URL, open_url)
            c.setopt(pycurl.WRITEFUNCTION, b.write)
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            c.perform()
            self.status = c.getinfo(pycurl.HTTP_CODE)
            print b.getvalue()
        except Exception, e:
            print e
        finally:
            c.close()
            b.close()
        if self.status != 200:
            self.Write_Debug_Log(self.status, open_url)
            return False
        else:
            self.logger.info("Great, successfully read file %s from hdfs" % hdfs_path)
            return True
    def copy_in_hdfs(self, src, dst, overwrite="true", permission="755", buffersize="128"):
        """Copy a file inside HDFS by downloading it to a temp file and uploading it again."""
        tmpfile = "/tmp/copy_inhdfs_tmpfile"
        self.get_file(tmpfile, src)
        if self.status == 200:
            self.put_file(tmpfile, dst, overwrite="true")
            if self.status == 201:
                os.remove(tmpfile)
                return True
            else:
                os.remove(tmpfile)
                return False
        else:
            os.remove(tmpfile)
            return False
    def Write_Debug_Log(self, status, url):
        """Log a non-2xx response but do not abort."""
        if status != 200 and status != 201:
            self.logger.error('Url : "%s" , Exit code : %s' % (url, self.status))
            self.logger.error("Hit an error, but not quitting")
```
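
A short usage sketch for the class; the host, port, user, and paths here are hypothetical:

```python
# Usage sketch for WebHadoop; host, port, user and paths are hypothetical.
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("webhadoop")

hdfs = WebHadoop("192.168.0.112", "50071", "hadoop", logger)
hdfs.checklink()                                   # verify there are live datanodes
hdfs.mkdir("/test_pycurl")                         # create a directory
hdfs.put_file("/tmp/local.txt", "/test_pycurl/remote.txt")
for entry in hdfs.lsdir("/test_pycurl"):           # list the new directory
    print entry["pathSuffix"], entry["type"]
hdfs.get_file("/tmp/back.txt", "/test_pycurl/remote.txt")
hdfs.remove("/test_pycurl", recursive="true")      # recursive delete
```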
Compared with Hadoop's own Java command-line tools, the curl-based implementation still has some shortcomings:
1. It does not support copying files within Hadoop natively (copy_in_hdfs works through a local temporary file).
2. It does not support uploading or downloading directories.
3. During testing, an upload through the shell fails if the file already exists, while with the curl approach the overwrite=true parameter must be set for the upload to succeed; I am not sure why.
The one real benefit is that execution time improves dramatically.
For the same directory-listing command:
#time hadoop fs -ls hdfs://192.168.0.112:50081/
real 0m10.916s
user 0m4.082s
sys 0m6.799s
The curl version is much faster. The code for this class is still being improved.