map.py
#!/usr/bin/python
# encoding:utf-8
import sys

# Substring a record's first field must contain for the record to be counted.
KEYWORD = "捉妖记"


def extract_keys(lines, keyword=KEYWORD):
    """Yield the first comma-separated field of each line that mentions *keyword*.

    Each line is stripped of surrounding whitespace, split on ',' and only
    the first field is inspected; the field is yielded unchanged when
    *keyword* occurs anywhere inside it.

    lines   -- iterable of text lines (e.g. sys.stdin)
    keyword -- substring to look for in the first field
    """
    for raw in lines:
        field = raw.strip().split(',')[0]
        if keyword in field:
            yield field


def main():
    """Mapper entry point: write "<field><TAB>1" for every matching input line."""
    for key in extract_keys(sys.stdin):
        # sys.stdout.write behaves the same under Python 2 and Python 3,
        # unlike the Python-2-only print statement.
        sys.stdout.write('%s\t%s\n' % (key, 1))


# Guarded so the module can be imported (and tested) without consuming stdin.
if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------------------------
red.py
#!/usr/bin/python
# encoding:utf-8
from operator import itemgetter
import sys

def sum_counts(lines):
    """Add up the counts of "<key><whitespace><count>" records per key.

    The count is taken to be the last whitespace-separated token of the
    line, so a key that itself contains spaces or tabs stays intact.
    Records that cannot be parsed (blank line, missing count, non-integer
    count) are skipped instead of aborting the whole run.

    lines -- iterable of text lines (e.g. sys.stdin)
    Returns a dict mapping each key to the sum of its counts.
    """
    totals = {}
    for raw in lines:
        try:
            # Split once from the right: everything before the last run of
            # whitespace is the key. Unpacking raises ValueError when the
            # line has fewer than two tokens, int() when the count is not
            # a number; both are handled below.
            key, count = raw.strip().rsplit(None, 1)
            count = int(count)
        except ValueError:
            continue
        totals[key] = totals.get(key, 0) + count
    return totals


def main():
    """Reducer entry point: write "<key><TAB><total>" lines ordered by key."""
    for key, total in sorted(sum_counts(sys.stdin).items(), key=itemgetter(0)):
        sys.stdout.write('%s\t%s\n' % (key, total))


# Guarded so the module can be imported (and tested) without consuming stdin.
if __name__ == '__main__':
    main()

---------------------------------------------------------------------------------
数据:
dat0204.log
----------------------------------------------------------------------------------
运行脚本:(集群)
# Cluster run: submit a Hadoop Streaming job that reads /home/hduser/dat0204.log,
# writes to /home/hduser/output, and uses map.py / red.py (passed along via -file)
# as the mapper and reducer commands.
/usr/local/hadoop/bin/hadoop jar /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.7.1.jar -input /home/hduser/dat0204.log -output /home/hduser/output -file /home/hduser/map.py -file /home/hduser/red.py -mapper "python map.py" -reducer "python red.py"

submit.sh($hadoop_home)
# Remove the output directory left by a previous run before resubmitting.
# Only the exact directory is deleted (no glob), and -f keeps the command
# quiet when the directory does not exist yet. "-rm -r" replaces the
# deprecated "-rmr".
hadoop fs -rm -r -f /home/hduser/output
# Submit the streaming job with a single reduce task. The generic -D option
# has to come before the streaming-specific options.
/usr/local/hadoop/bin/hadoop jar /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.7.1.jar \
-D mapred.reduce.tasks=1 \
-mapper "python map.py" \
-reducer "python red.py" \
-file /home/hduser/map.py \
-file /home/hduser/red.py \
-input /home/hduser/dat0204.log \
-output /home/hduser/output

# Print the job's results: every file in the output directory whose name starts
# with "pa" (presumably the part-* files written by the reducer -- confirm).
hadoop fs -text /home/hduser/output/pa*

单机:
# Local dry run without Hadoop: feed the log through the mapper, sort the
# emitted lines, and pipe them into the reducer. The reducer accumulates
# totals in a dict, so it does not depend on the ordering that sort provides.
cat /home/hduser/dat0204.log |python map.py|sort|python red.py


---------------------------------------------------------------------------------------------------------------------------------------------------------
map.py
#!/usr/bin/python
# encoding:utf-8
import sys

# Substring a field must contain for it to be counted.
KEYWORD = "道士下山"


def extract_keys(lines, keyword=KEYWORD):
    """Yield every ';'-separated entry's leading field that mentions *keyword*.

    Each line is stripped of surrounding whitespace and split on ';'. For
    every resulting entry only the text before the first tab is inspected;
    it is yielded unchanged when *keyword* occurs anywhere inside it.

    lines   -- iterable of text lines (e.g. sys.stdin)
    keyword -- substring to look for in each entry's leading field
    """
    for raw in lines:
        for entry in raw.strip().split(';'):
            field = entry.split('\t')[0]
            if keyword in field:
                yield field


def main():
    """Mapper entry point: write "<field><TAB>1" for every matching entry."""
    for key in extract_keys(sys.stdin):
        # sys.stdout.write behaves the same under Python 2 and Python 3,
        # unlike the Python-2-only print statement.
        sys.stdout.write('%s\t%s\n' % (key, 1))


# Guarded so the module can be imported (and tested) without consuming stdin.
if __name__ == '__main__':
    main()


red.py
#!/usr/bin/python
# encoding:utf-8
from operator import itemgetter
import sys

def sum_counts(lines):
    """Add up the counts of "<key><whitespace><count>" records per key.

    The count is taken to be the last whitespace-separated token of the
    line, so a key that itself contains spaces or tabs stays intact.
    Records that cannot be parsed (blank line, missing count, non-integer
    count) are skipped instead of aborting the whole run.

    lines -- iterable of text lines (e.g. sys.stdin)
    Returns a dict mapping each key to the sum of its counts.
    """
    totals = {}
    for raw in lines:
        try:
            # Split once from the right: everything before the last run of
            # whitespace is the key. Unpacking raises ValueError when the
            # line has fewer than two tokens, int() when the count is not
            # a number; both are handled below.
            key, count = raw.strip().rsplit(None, 1)
            count = int(count)
        except ValueError:
            continue
        totals[key] = totals.get(key, 0) + count
    return totals


def main():
    """Reducer entry point: write "<key><TAB><total>" lines ordered by key."""
    for key, total in sorted(sum_counts(sys.stdin).items(), key=itemgetter(0)):
        sys.stdout.write('%s\t%s\n' % (key, total))


# Guarded so the module can be imported (and tested) without consuming stdin.
if __name__ == '__main__':
    main()
---------------------------------------------------------------------------------
数据:
dat0203.log

运行脚本如上所示:
# Cluster run: submit a Hadoop Streaming job that reads /home/hduser/dat0203.log,
# writes to /home/hduser/output, and uses map.py / red.py (passed along via -file)
# as the mapper and reducer commands.
/usr/local/hadoop/bin/hadoop jar /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.7.1.jar -input /home/hduser/dat0203.log -output /home/hduser/output -file /home/hduser/map.py -file /home/hduser/red.py -mapper "python map.py" -reducer "python red.py"

submit.sh
# Submit the streaming job for dat0203.log with one reduce task and an
# explicit job name. The input path now matches the data file named above
# (dat0203.log, not dat0203.dat). The deprecated -jobconf options are
# replaced by generic -D options, which have to come before the
# streaming-specific options.
/usr/local/hadoop/bin/hadoop jar /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.7.1.jar \
-D mapred.reduce.tasks=1 \
-D mapred.job.name="qianjc_test" \
-input /home/hduser/dat0203.log \
-output /home/hduser/output \
-file /home/hduser/map.py \
-file /home/hduser/red.py \
-mapper "python map.py" \
-reducer "python red.py"

Logo

CSDN联合极客时间,共同打造面向开发者的精品内容学习社区,助力成长!

更多推荐