Hadoop Streaming统计电影出现的次数

map.py：#!/usr/bin/python | # encoding:utf-8 | import sys | word2count = {} | for line in sys.stdin: | line = line.strip() | splited = line.split(',')[0] | if "捉妖记" in splited: | print '%s\t%s' % (spl…

浅汐王

1119人浏览 · 2017-10-07 16:09:50

浅汐王 · 2017-10-07 16:09:50 发布

  map.py
 
  #!/usr/bin/python
  # encoding:utf-8
  """Hadoop Streaming mapper.

  Reads records from stdin and writes "<field><TAB>1" to stdout for every
  line whose first comma-separated field contains KEYWORD.
  """
  import sys

  # Substring that the first comma-separated field must contain.
  KEYWORD = "捉妖记"


  def map_lines(lines, keyword=KEYWORD):
      """Yield one "<field><TAB>1" record per matching input line.

      Args:
          lines: iterable of raw input lines (trailing newline allowed).
          keyword: substring looked for in the first comma-separated field.

      Yields:
          The first field and the count 1, joined by a tab.
      """
      for raw in lines:
          # Only the first comma-separated field is inspected and emitted.
          field = raw.strip().split(',')[0]
          if keyword in field:
              yield '%s\t%s' % (field, 1)


  def main():
      """Run the mapper over stdin, printing one record per line."""
      for record in map_lines(sys.stdin):
          # Single-argument print() behaves the same on Python 2 and 3.
          print(record)


  if __name__ == '__main__':
      main()
 
  --------------------------------------------------------------------------------------------------
 
  red.py
 
  #!/usr/bin/python
  # encoding:utf-8
  """Hadoop Streaming reducer.

  Reads "<key><whitespace><count>" records from stdin, sums the counts per
  key and writes "<key><TAB><total>" to stdout, ordered by key.
  """
  from operator import itemgetter
  import sys


  def reduce_lines(lines):
      """Sum the integer counts per key.

      Args:
          lines: iterable of raw records; the count is the last
              whitespace-separated token, everything before it is the key.

      Returns:
          A list of (key, total) tuples sorted by key.
      """
      totals = {}
      for raw in lines:
          # Split on the LAST whitespace run only, so keys that contain
          # spaces stay intact instead of breaking the unpacking.
          parts = raw.strip().rsplit(None, 1)
          if len(parts) != 2:
              # Blank or malformed record: skip it, like a non-integer count.
              continue
          key, raw_count = parts
          try:
              count = int(raw_count)
          except ValueError:
              continue
          totals[key] = totals.get(key, 0) + count
      return sorted(totals.items(), key=itemgetter(0))


  def main():
      """Run the reducer over stdin, printing one "<key><TAB><total>" per key."""
      for key, total in reduce_lines(sys.stdin):
          # Single-argument print() behaves the same on Python 2 and 3.
          print('%s\t%s' % (key, total))


  if __name__ == '__main__':
      main()
 
  ---------------------------------------------------------------------------------
 
  数据：
 
  ----------------------------------------------------------------------------------
 
  运行脚本：（集群）
 
  # -files replaces the deprecated -file option; as a generic option it must come before the streaming options.
  /usr/local/hadoop/bin/hadoop jar /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.7.1.jar -files /home/hduser/map.py,/home/hduser/red.py -input /home/hduser/dat0204.log -output /home/hduser/output -mapper "python map.py" -reducer "python red.py"
 
  submit.sh（$hadoop_home）
 
  # Remove the previous output directory (-f: no error if it does not exist yet).
  hadoop fs -rm -r -f /home/hduser/output

  # Generic options (-D, -files) must come before the streaming options.
  /usr/local/hadoop/bin/hadoop jar /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.7.1.jar \
      -D mapreduce.job.reduces=1 \
      -files /home/hduser/map.py,/home/hduser/red.py \
      -input /home/hduser/dat0204.log \
      -output /home/hduser/output \
      -mapper "python map.py" \
      -reducer "python red.py"

  # Print the job's result files.
  hadoop fs -text /home/hduser/output/part-*
 
  单机：
 
  # Local run of the same mapper | sort | reducer pipeline, without Hadoop.
  python map.py < /home/hduser/dat0204.log | sort | python red.py
 
  ---------------------------------------------------------------------------------------------------------------------------------------------------------
 
  map.py
 
  #!/usr/bin/python
  # encoding:utf-8
  """Hadoop Streaming mapper.

  Each input line holds ';'-separated entries. For every entry whose first
  tab-separated field contains KEYWORD, writes "<field><TAB>1" to stdout.
  """
  import sys

  # Substring that an entry's first tab-separated field must contain.
  KEYWORD = "道士下山"


  def map_lines(lines, keyword=KEYWORD):
      """Yield one "<field><TAB>1" record per matching entry.

      Args:
          lines: iterable of raw input lines (trailing newline allowed).
          keyword: substring looked for in each entry's first field.

      Yields:
          The matching field and the count 1, joined by a tab.
      """
      for raw in lines:
          for entry in raw.strip().split(';'):
              # Only the part before the first tab is inspected and emitted.
              field = entry.split('\t')[0]
              if keyword in field:
                  yield '%s\t%s' % (field, 1)


  def main():
      """Run the mapper over stdin, printing one record per line."""
      for record in map_lines(sys.stdin):
          # Single-argument print() behaves the same on Python 2 and 3.
          print(record)


  if __name__ == '__main__':
      main()
 
  red.py
 
  #!/usr/bin/python
  # encoding:utf-8
  """Hadoop Streaming reducer.

  Reads "<key><whitespace><count>" records from stdin, sums the counts per
  key and writes "<key><TAB><total>" to stdout, ordered by key.
  """
  from operator import itemgetter
  import sys


  def reduce_lines(lines):
      """Sum the integer counts per key.

      Args:
          lines: iterable of raw records; the count is the last
              whitespace-separated token, everything before it is the key.

      Returns:
          A list of (key, total) tuples sorted by key.
      """
      totals = {}
      for raw in lines:
          # Split on the LAST whitespace run only, so keys that contain
          # spaces stay intact instead of breaking the unpacking.
          parts = raw.strip().rsplit(None, 1)
          if len(parts) != 2:
              # Blank or malformed record: skip it, like a non-integer count.
              continue
          key, raw_count = parts
          try:
              count = int(raw_count)
          except ValueError:
              continue
          totals[key] = totals.get(key, 0) + count
      return sorted(totals.items(), key=itemgetter(0))


  def main():
      """Run the reducer over stdin, printing one "<key><TAB><total>" per key."""
      for key, total in reduce_lines(sys.stdin):
          # Single-argument print() behaves the same on Python 2 and 3.
          print('%s\t%s' % (key, total))


  if __name__ == '__main__':
      main()
 
  ---------------------------------------------------------------------------------
 
  数据：
 
  运行脚本如上所示:
 
  # -files replaces the deprecated -file option; as a generic option it must come before the streaming options.
  /usr/local/hadoop/bin/hadoop jar /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.7.1.jar -files /home/hduser/map.py,/home/hduser/red.py -input /home/hduser/dat0203.log -output /home/hduser/output -mapper "python map.py" -reducer "python red.py"
 
  submit.sh
 
  # NOTE(review): the one-line command above reads dat0203.log while this script reads dat0203.dat; confirm which file exists.
  # -D replaces the deprecated -jobconf option; generic options (-D, -files) must come before the streaming options.
  /usr/local/hadoop/bin/hadoop jar /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.7.1.jar \
      -D mapreduce.job.reduces=1 \
      -D mapreduce.job.name="qianjc_test" \
      -files /home/hduser/map.py,/home/hduser/red.py \
      -input /home/hduser/dat0203.dat \
      -output /home/hduser/output \
      -mapper "python map.py" \
      -reducer "python red.py"