1 环境:基于上一篇搭建的高可用分布式集群
2 运行官方提供的 MapReduce 示例程序
# Estimate pi with the `pi` program from the bundled MapReduce examples jar.
# The trailing arguments 2 and 6 are presumably the number of map tasks and
# the number of samples per map (`pi <nMaps> <nSamples>`) — confirm against
# the usage text printed by the examples jar.
cd  /data/hadoop/share/hadoop/mapreduce/
hadoop  jar  hadoop-mapreduce-examples-3.4.0.jar  pi 2 6 
 
 
3 实例项目分析1
#待分析的文件,例如用于单词统计的 word.txt

#
# Upload the sample file word.txt to the HDFS path /test/01/.
hdfs dfs  -put word.txt /test/01/
#可以先在本地测试一下,再运行计算(注意:下面的管道没有经过 sort,r.py 只会合并相邻的相同单词,所以本地输出并未完全汇总;在 m.py 与 r.py 之间加上 sort 才更接近集群上的实际结果)
[root@master11 01]# cat word.txt   |  python  m.py |python r.py
foo	1
quux	1
labs	1
foo	1
bar	1
quux	1
good	1
six	1
good	1
foo	2
quux	1
labs	1
foo	1
bar	1
quux	1
good	1
six	1
good	1
foo	1
# Run the word count as a Hadoop Streaming job: m.py is the mapper, r.py is
# the reducer, input is read from /test/01/ and results go to /test/output1/.
# NOTE(review): the leading "# " on the command below looks like a root shell
# prompt copied from the terminal; as written the line is a shell comment and
# would not execute — confirm before running this file as a script.
# NOTE(review): stream.non.zero.exit.is.failure=false presumably stops the job
# from failing when a script exits non-zero, which can hide script errors —
# verify against the Hadoop Streaming documentation.
# hadoop jar /data/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.4.0.jar  -D stream.non.zero.exit.is.failure=false -mapper /data/test/01/m.py  -reducer /data/test/01/r.py   -input /test/01/  -output /test/output1/ 
# Copy the result file from HDFS to the server's local filesystem.
hadoop fs -copyToLocal /test/output1/part-00000 /root/part-00000
#查看
[root@master11 ~]# cat  part-00000 
bar	2
foo	6
good	4
labs	2
quux	4
six	2
[root@master11 01]# cat m.py
#!/usr/bin/env python
"""Hadoop Streaming mapper for the word-count example.

Reads text lines from stdin and writes one ``word<TAB>1`` record to stdout
for every whitespace-separated word.
"""
import sys


def map_words(lines):
    """Yield a ``(word, 1)`` pair for every whitespace-separated word.

    lines: any iterable of text lines (for example ``sys.stdin``).
    """
    for line in lines:
        # str.split() with no argument ignores leading/trailing whitespace,
        # so blank lines simply contribute no words.
        for word in line.split():
            yield word, 1


def main():
    """Stream stdin through map_words and print tab-separated records."""
    for word, count in map_words(sys.stdin):
        # sys.stdout.write behaves the same on Python 2 and Python 3,
        # unlike the Python-2-only print statement.
        sys.stdout.write('%s\t%s\n' % (word, count))


if __name__ == '__main__':
    main()
[root@master11 01]# cat r.py 
#!/usr/bin/env python
"""Hadoop Streaming reducer for the word-count example.

Reads ``word<TAB>count`` records from stdin and writes ``word<TAB>total``
to stdout, where the total is summed over each run of consecutive records
that share the same word. The input therefore has to be sorted by word for
the totals to be complete.
"""
import sys
from itertools import groupby
from operator import itemgetter


def parse_pairs(lines):
    """Yield ``(word, count)`` for every well-formed record in lines.

    Records without a tab separator or with a non-integer count are
    skipped rather than crashing the reducer.
    """
    for line in lines:
        word, sep, raw_count = line.strip().partition('\t')
        if not sep:
            # No tab at all (e.g. a blank line): not a mapper record.
            continue
        try:
            count = int(raw_count)
        except ValueError:
            # Count was not a number: silently ignore this record.
            continue
        yield word, count


def reduce_counts(lines):
    """Yield ``(word, total)`` for each run of consecutive equal words.

    lines: any iterable of ``word<TAB>count`` text lines.
    Empty input yields nothing.
    """
    for word, run in groupby(parse_pairs(lines), key=itemgetter(0)):
        yield word, sum(count for _, count in run)


def main():
    """Stream stdin through reduce_counts and print tab-separated totals."""
    for word, total in reduce_counts(sys.stdin):
        # sys.stdout.write behaves the same on Python 2 and Python 3.
        sys.stdout.write('%s\t%s\n' % (word, total))


if __name__ == '__main__':
    main()
4 项目实战分析2:nginx 日志 IP 计数

# Upload the log file t.log to the HDFS path /test/01/.
hdfs dfs  -put t.log /test/01/
# Run the computation as a Hadoop Streaming job: map.py is the mapper, red.py
# is the reducer, input is /test/01/t.log and results go to /test/output2/.
# NOTE(review): stream.non.zero.exit.is.failure=false presumably stops the job
# from failing when a script exits non-zero, which can hide script errors —
# verify against the Hadoop Streaming documentation.
hadoop jar /data/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.4.0.jar  -D stream.non.zero.exit.is.failure=false -mapper /data/test/01/map.py  -reducer /data/test/01/red.py   -input /test/01/t.log  -output /test/output2/
# Download the result file from HDFS to the local filesystem.
# NOTE(review): /root/part-00000 is also the destination used for the
# word-count result earlier in this file; -copyToLocal may refuse to overwrite
# an existing file — confirm, or remove/rename the old copy first.
hadoop fs -copyToLocal /test/output2/part-00000 /root/part-00000
#查看
 
 
[root@master11 01]# cat  map.py 
#!/usr/bin/python
"""Hadoop Streaming mapper for counting requests per client IP.

Reads log lines from stdin and, for every line that begins with a run of
digits and dots, writes `` <ip><TAB>1`` to stdout.
"""
import sys
import re

# Compiled once at import time instead of once per input line. The ``+``
# quantifier requires at least one digit or dot, so a line that does not
# start with an address yields no match instead of an empty key.
IP_PREFIX = re.compile(r'([\d.]+)')


def extract_ip(line):
    """Return the leading run of digits and dots in line, or None.

    line: one text line from the log.
    """
    match = IP_PREFIX.match(line)
    if match:
        return match.group(1)
    return None


def main():
    """Emit one record for every stdin line that starts with an address."""
    for line in sys.stdin:
        ip = extract_ip(line)
        if ip is not None:
            # NOTE(review): the leading space in the format string is kept
            # exactly as in the original output — confirm it is intended.
            # sys.stdout.write behaves the same on Python 2 and Python 3.
            sys.stdout.write(' %s\t%s\n' % (ip, 1))


if __name__ == '__main__':
    main()
[root@master11 01]# cat red.py 
#!/usr/bin/python
#-*-coding:UTF-8 -*-
"""Hadoop Streaming reducer that counts identical input lines.

Reads lines from stdin, counts how many times each distinct line occurs
(the whole line, minus its trailing newline, is the key) and writes
``<line><TAB><occurrences>`` to stdout once all input has been read.
"""
import sys


def count_lines(lines):
    """Return a dict mapping each distinct line to its number of occurrences.

    lines: any iterable of text lines; only a trailing newline is removed,
    so a final line without a newline keeps all of its characters.
    """
    counts = {}
    for line in lines:
        key = line.rstrip('\n')
        # dict.get replaces the Python-2-only has_key() check.
        counts[key] = counts.get(key, 0) + 1
    return counts


def main():
    """Count the lines on stdin and print one total per distinct line."""
    counts = count_lines(sys.stdin)
    for key in counts:
        # sys.stdout.write behaves the same on Python 2 and Python 3.
        sys.stdout.write(key + "\t" + str(counts[key]) + "\n")


if __name__ == '__main__':
    main()
# 5 Everyone is welcome to get in touch and discuss. (Closing heading of the
# surrounding notes; it was fused onto the last line of this script.)