The code has been copied to this path on my work computer:
/Users/baidu/Documents/Data/Work/Code/Self/hadoop_mr_streaming_jobs
First comes the main driver script, main.sh.
It invokes extract.py.
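I haven't inlined main.sh here, but a minimal streaming launcher usually follows the pattern below. This is only a sketch: the jar path, HDFS directories, and the reducer name are placeholders, not the real script's values.

# Sketch of a typical streaming launcher; all paths/names below are assumed.
STREAMING_JAR=/usr/lib/hadoop-mapreduce/hadoop-streaming.jar   # assumed install location

hadoop jar "$STREAMING_JAR" \
    -input   /user/work/input \
    -output  /user/work/output \
    -mapper  "python extract.py" \
    -reducer "python reducer.py" \
    -file    extract.py \
    -file    reducer.py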
On closer inspection, it isn't written very well. It also involves a combiner; see here for background:
https://blog.csdn.net/u010700335/article/details/72649186
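For reference, in streaming a combiner is just another command passed via the -combiner flag; it runs on each map task's local output before the shuffle. For word count the reducer can double as the combiner, since summing partial counts is associative and commutative. A sketch, with script names assumed:

# Adding a combiner to a streaming job is one extra flag (names assumed for illustration).
hadoop jar "$STREAMING_JAR" \
    -input    /user/work/input \
    -output   /user/work/output \
    -mapper   "python mapper.py" \
    -combiner "python reducer.py" \
    -reducer  "python reducer.py" \
    -file mapper.py -file reducer.py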
Streaming scripts are built on pipes: the framework writes each input record to the script's stdin and reads key/value lines back from its stdout:
(5) Python scripts
import sys

for line in sys.stdin:
    # ... process each line read from the pipe ...
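Because everything is plain stdin/stdout, the whole job can be rehearsed locally without a cluster; `sort` stands in for Hadoop's shuffle/sort phase. A sketch, with file names assumed:

# Simulate the streaming job in the shell; sort plays the role of the shuffle.
cat input.txt | python mapper.py | sort -k1,1 | python reducer.py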
#!/usr/bin/env python
# mapper.py
import sys

# input comes from STDIN (standard input)
for line in sys.stdin:
    # remove leading and trailing whitespace
    line = line.strip()
    # split the line into words; split() already drops empty strings
    words = line.split()
    for word in words:
        # write the results to STDOUT (standard output);
        # what we output here will be the input for the
        # Reduce step, i.e. the input for reducer.py
        #
        # tab-delimited; the trivial word count is 1
        print('%s\t%s' % (word, 1))

#---------------------------------------------------------------------------------------------------------

#!/usr/bin/env python
# reducer.py
from operator import itemgetter
import sys

# maps words to their counts
word2count = {}

# input comes from STDIN
for line in sys.stdin:
    # remove leading and trailing whitespace
    line = line.strip()

    # parse the tab-delimited input we got from mapper.py
    word, count = line.split('\t', 1)

    # convert count (currently a string) to int
    try:
        count = int(count)
        word2count[word] = word2count.get(word, 0) + count
    except ValueError:
        # count was not a number, so silently
        # ignore/discard this line
        pass

# sort the words lexicographically;
# this step is NOT required, we just do it so that our
# final output will look more like the official Hadoop
# word count examples
sorted_word2count = sorted(word2count.items(), key=itemgetter(0))

# write the results to STDOUT (standard output)
for word, count in sorted_word2count:
    print('%s\t%s' % (word, count))
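Note that this reducer aggregates everything in an in-memory dict, so it works even on unsorted input, which is what makes the local pipe test above forgiving. A production streaming reducer usually exploits the fact that Hadoop delivers keys already sorted, streaming through runs of identical words instead of holding the whole table in memory.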