map的输出,通过分区函数决定要发往哪个reducer。
有2种情况,我们自定义的Partitioner不会被调用
1) reducer个数为0
这种情况,没有reducer,不需要分区
2) reducer个数为1
这种情况,所有的map输出都会发到这个唯一的reducer,不需要调用我们的自定义Partitioner
hadoop源码
private class NewOutputCollector<K,V>extends org.apache.hadoop.mapreduce.RecordWriter<K,V> {private final MapOutputCollector<K,V> collector;private final org.apache.hadoop.mapreduce.Partitioner<K,V> partitioner;private final int partitions;@SuppressWarnings("unchecked")NewOutputCollector(org.apache.hadoop.mapreduce.JobContext jobContext,JobConf job,TaskUmbilicalProtocol umbilical,TaskReporter reporter) throws IOException, ClassNotFoundException {collector = createSortingCollector(job, reporter);partitions = jobContext.getNumReduceTasks();if (partitions > 1) { // 总分区数(也就是reducer数量)大于1的时候,引用自定义Partitionerpartitioner = (org.apache.hadoop.mapreduce.Partitioner<K,V>)ReflectionUtils.newInstance(jobContext.getPartitionerClass(), job);} else { partitioner = new org.apache.hadoop.mapreduce.Partitioner<K,V>() {@Overridepublic int getPartition(K key, V value, int numPartitions) {return partitions - 1;}};}}
}