1 系统、软件以及前提约束

CentOS-7 64



2 操作步骤



import sys

def map_words(lines):
    """Yield one ``word<TAB>1`` record for every word found in *lines*.

    This is the Map step of a streaming word count: each record produced
    here becomes one input line for reducer.py.

    Args:
        lines: any iterable of text lines (for example ``sys.stdin``).

    Yields:
        Strings of the form ``'<word>\\t1'``, without a trailing newline.
    """
    for line in lines:
        # split() with no argument splits on runs of whitespace and ignores
        # leading/trailing whitespace, so no separate strip() is needed and
        # blank lines simply produce no words.
        for word in line.split():
            # tab-delimited; the trivial word count is 1
            yield '%s\t%s' % (word, 1)


def main():
    """Read text from STDIN and write the mapped records to STDOUT."""
    for record in map_words(sys.stdin):
        # Single parenthesised argument keeps this valid on Python 2 and 3.
        print(record)


if __name__ == "__main__":
    main()


# Local smoke test: pipe one sample line into mapper.py and inspect the
# tab-separated "word<TAB>1" records it prints, one per input word.
echo aa bb cc dd aa cc|python mapper.py





from operator import itemgetter

import sys

def reduce_counts(lines):
    """Sum the counts of consecutive records that share the same word.

    This is the Reduce step of a streaming word count. It only works when
    the input is sorted by key (here: word), which Hadoop guarantees for
    map output before it is passed to the reducer; equal words must
    therefore arrive on adjacent lines.

    Args:
        lines: any iterable of ``word<TAB>count`` text lines
            (for example ``sys.stdin``).

    Yields:
        Strings of the form ``'<word>\\t<total>'``, one per distinct run of
        equal words, without a trailing newline.
    """
    current_word = None
    current_count = 0

    for line in lines:
        # remove leading and trailing whitespace
        line = line.strip()

        # parse the input we got from mapper.py
        try:
            word, count = line.split('\t', 1)
            # convert count (currently a string) to int
            count = int(count)
        except ValueError:
            # The line had no tab or the count was not a number, so
            # silently ignore/discard this line.
            continue

        if word == current_word:
            current_count += count
            continue

        # A new key starts: flush the finished one (if any) first.
        if current_word is not None:
            yield '%s\t%s' % (current_word, current_count)
        current_word = word
        current_count = count

    # Do not forget to output the last word. Testing against None (rather
    # than against the most recently parsed word) emits nothing for empty
    # input and still flushes when the final input line was discarded.
    if current_word is not None:
        yield '%s\t%s' % (current_word, current_count)


def main():
    """Read sorted mapper output from STDIN and write the totals to STDOUT."""
    for record in reduce_counts(sys.stdin):
        # Single parenthesised argument keeps this valid on Python 2 and 3.
        print(record)


if __name__ == "__main__":
    main()


# Local end-to-end test of the whole pipeline. `sort` stands in for the
# sort-by-key that Hadoop performs between the map and reduce steps, so
# that reducer.py sees equal words on adjacent lines.
echo aa bb cc dd aa cc|python mapper.py|sort|python reducer.py




aa bb cc dd aa cc

aa bb cc dd aa cc

aa bb cc dd aa cc

aa bb cc dd aa cc

aa bb cc dd aa cc cc dd


# Create the /data directory in HDFS.
hdfs dfs -mkdir /data

# Copy the local file info.txt into HDFS as /data/info.
hdfs dfs -put info.txt /data/info


# Submit the word count as a Hadoop Streaming job. The options must reach
# `hadoop jar` as one command, hence the trailing backslashes.
# NOTE(review): the streaming jar path below is the conventional location in
# Hadoop 2.x/3.x installs -- confirm it against your own $HADOOP_HOME.
# The -output directory must not already exist in HDFS.
$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \
    -input "/data/*" \
    -output "/out99" \
    -mapper "python mapper.py" \
    -reducer "python reducer.py" \
    -file "/root/mapper.py" \
    -file "/root/reducer.py"



