Hadoop Streaming统计电影出现的次数
摘要:使用 Hadoop Streaming 配合两个 Python 脚本(map.py 与 red.py),统计日志中指定电影名称(如“捉妖记”)出现的次数;完整代码见下文。
·
map.py
#!/usr/bin/python
# encoding:utf-8
"""Hadoop Streaming mapper: emit a tab-separated (field, 1) record per match.

Each input line is stripped and split on commas; only the first field is
inspected.  When that field contains KEYWORD, the field and the count 1
are written to stdout separated by a tab, one record per line.

Runs unchanged on Python 2 and Python 3 (no print statement is used).
"""
import sys

# Movie title searched for in the first comma-separated field of each line.
KEYWORD = "捉妖记"


def map_lines(lines, keyword=KEYWORD):
    """Yield one 'field<TAB>1' record for every line whose first field matches.

    lines   -- iterable of text lines (trailing newlines are stripped here)
    keyword -- substring that must occur in the first comma-separated field
    """
    for line in lines:
        first_field = line.strip().split(',')[0]
        if keyword in first_field:
            yield '%s\t%s' % (first_field, 1)


def main():
    """Stream stdin through map_lines and write each record to stdout."""
    for record in map_lines(sys.stdin):
        sys.stdout.write(record + '\n')


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------------------------
red.py
#!/usr/bin/python
# encoding:utf-8
"""Hadoop Streaming reducer: sum the per-key counts produced by map.py.

Reads 'key<whitespace>count' records from stdin, accumulates the counts
per key, and writes 'key<TAB>total' lines to stdout ordered by key.

Runs unchanged on Python 2 and Python 3 (no print statement is used).
"""
from operator import itemgetter
import sys


def reduce_lines(lines):
    """Return a key-sorted list of (key, total) pairs for the given records.

    lines -- iterable of text lines, each holding a key followed by an
             integer count; the count is the last whitespace-separated
             token, so keys that themselves contain spaces or tabs are
             kept intact.

    Blank lines, lines without a count, and lines whose count is not an
    integer are skipped instead of aborting the whole reduce task.
    """
    word2count = {}
    for line in lines:
        try:
            # Split from the right, once: everything before the last
            # whitespace run is the key, the final token is the count.
            word, count = line.strip().rsplit(None, 1)
            count = int(count)
        except ValueError:
            # Wrong number of fields or a non-numeric count: ignore the record.
            continue
        word2count[word] = word2count.get(word, 0) + count
    # Ordered by key; use key=itemgetter(1) to order by total instead.
    return sorted(word2count.items(), key=itemgetter(0))


def main():
    """Reduce stdin and write one 'key<TAB>total' line per key to stdout."""
    for word, count in reduce_lines(sys.stdin):
        sys.stdout.write('%s\t%s\n' % (word, count))


if __name__ == '__main__':
    main()
---------------------------------------------------------------------------------
数据:
----------------------------------------------------------------------------------
运行脚本:(集群)
# Submit the streaming job as a single command: map.py and red.py are shipped
# with the job and run as the mapper and the reducer over dat0204.log.
/usr/local/hadoop/bin/hadoop jar /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.7.1.jar -input /home/hduser/dat0204.log -output /home/hduser/output -file /home/hduser/map.py -file /home/hduser/red.py -mapper "python map.py" -reducer "python red.py"
submit.sh($hadoop_home)
# Remove the previous output directory first: the job refuses to start when
# the output path already exists. -f keeps this quiet on the very first run.
hadoop fs -rm -r -f /home/hduser/output
# Generic options (-D, -files) must come before the streaming options.
/usr/local/hadoop/bin/hadoop jar /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.7.1.jar \
-D mapreduce.job.reduces=1 \
-files /home/hduser/map.py,/home/hduser/red.py \
-mapper "python map.py" \
-reducer "python red.py" \
-input /home/hduser/dat0204.log \
-output /home/hduser/output
# Print the reducer output; the glob is quoted so that Hadoop, not the local
# shell, expands it.
hadoop fs -text '/home/hduser/output/part-*'
单机:
# Local dry run without Hadoop: sort sits between the mapper and the reducer,
# so the reducer receives the mapper output ordered by key.
cat /home/hduser/dat0204.log |python map.py|sort|python red.py
---------------------------------------------------------------------------------------------------------------------------------------------------------
map.py
#!/usr/bin/python
# encoding:utf-8
"""Hadoop Streaming mapper: emit a tab-separated (title, 1) record per match.

Each input line is stripped and split on ';'.  For every segment, the text
before the first tab is taken as the title; when it contains KEYWORD, the
title and the count 1 are written to stdout separated by a tab.

Runs unchanged on Python 2 and Python 3 (no print statement is used).
"""
import sys

# Movie title searched for in each ';'-separated segment of a line.
KEYWORD = "道士下山"


def map_lines(lines, keyword=KEYWORD):
    """Yield one 'title<TAB>1' record for every matching segment.

    lines   -- iterable of text lines (trailing newlines are stripped here)
    keyword -- substring that must occur in the part of a segment that
               precedes its first tab
    """
    for line in lines:
        for segment in line.strip().split(';'):
            title = segment.split('\t')[0]
            if keyword in title:
                yield '%s\t%s' % (title, 1)


def main():
    """Stream stdin through map_lines and write each record to stdout."""
    for record in map_lines(sys.stdin):
        sys.stdout.write(record + '\n')


if __name__ == '__main__':
    main()
red.py
#!/usr/bin/python
# encoding:utf-8
"""Hadoop Streaming reducer: sum the per-key counts produced by map.py.

Reads 'key<whitespace>count' records from stdin, accumulates the counts
per key, and writes 'key<TAB>total' lines to stdout ordered by key.

Runs unchanged on Python 2 and Python 3 (no print statement is used).
"""
from operator import itemgetter
import sys


def reduce_lines(lines):
    """Return a key-sorted list of (key, total) pairs for the given records.

    lines -- iterable of text lines, each holding a key followed by an
             integer count; the count is the last whitespace-separated
             token, so keys that themselves contain spaces or tabs are
             kept intact.

    Blank lines, lines without a count, and lines whose count is not an
    integer are skipped instead of aborting the whole reduce task.
    """
    word2count = {}
    for line in lines:
        try:
            # Split from the right, once: everything before the last
            # whitespace run is the key, the final token is the count.
            word, count = line.strip().rsplit(None, 1)
            count = int(count)
        except ValueError:
            # Wrong number of fields or a non-numeric count: ignore the record.
            continue
        word2count[word] = word2count.get(word, 0) + count
    # Ordered by key; use key=itemgetter(1) to order by total instead.
    return sorted(word2count.items(), key=itemgetter(0))


def main():
    """Reduce stdin and write one 'key<TAB>total' line per key to stdout."""
    for word, count in reduce_lines(sys.stdin):
        sys.stdout.write('%s\t%s\n' % (word, count))


if __name__ == '__main__':
    main()
---------------------------------------------------------------------------------
数据:
运行脚本如上所示:
# Submit the streaming job as a single command: map.py and red.py are shipped
# with the job and run as the mapper and the reducer over dat0203.log.
# NOTE(review): the submit.sh variant below reads dat0203.dat instead - confirm
# which input file name is correct.
/usr/local/hadoop/bin/hadoop jar /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.7.1.jar -input /home/hduser/dat0203.log -output /home/hduser/output -file /home/hduser/map.py -file /home/hduser/red.py -mapper "python map.py" -reducer "python red.py"
submit.sh
# Generic options (-D, -files) replace the deprecated -jobconf / -file flags
# and must come before the streaming options.
# NOTE(review): the one-line command above this script reads dat0203.log while
# this script reads dat0203.dat - confirm which input file name is correct.
/usr/local/hadoop/bin/hadoop jar /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.7.1.jar \
-D mapreduce.job.reduces=1 \
-D mapreduce.job.name="qianjc_test" \
-files /home/hduser/map.py,/home/hduser/red.py \
-input /home/hduser/dat0203.dat \
-output /home/hduser/output \
-mapper "python map.py" \
-reducer "python red.py"
更多推荐
已为社区贡献5条内容
所有评论(0)