Skip to content

Commit 15a95d7

Browse files
committed
Hadoop—MR程序
0 parents  commit 15a95d7

File tree

38 files changed

+1240
-0
lines changed

38 files changed

+1240
-0
lines changed

.classpath

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<classpath>
3+
<classpathentry kind="src" path="src"/>
4+
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.7"/>
5+
<classpathentry kind="con" path="org.eclipse.jdt.USER_LIBRARY/hadoop-jar-2.4.1"/>
6+
<classpathentry kind="output" path="bin"/>
7+
</classpath>

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
/bin/

.project

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<projectDescription>
3+
<name>Hadoop-MR</name>
4+
<comment></comment>
5+
<projects>
6+
</projects>
7+
<buildSpec>
8+
<buildCommand>
9+
<name>org.eclipse.jdt.core.javabuilder</name>
10+
<arguments>
11+
</arguments>
12+
</buildCommand>
13+
</buildSpec>
14+
<natures>
15+
<nature>org.eclipse.jdt.core.javanature</nature>
16+
</natures>
17+
</projectDescription>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
eclipse.preferences.version=1
2+
encoding//input/emp=GBK

.settings/org.eclipse.jdt.core.prefs

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
eclipse.preferences.version=1
2+
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3+
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7
4+
org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
5+
org.eclipse.jdt.core.compiler.compliance=1.7
6+
org.eclipse.jdt.core.compiler.debug.lineNumber=generate
7+
org.eclipse.jdt.core.compiler.debug.localVariable=generate
8+
org.eclipse.jdt.core.compiler.debug.sourceFile=generate
9+
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
10+
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
11+
org.eclipse.jdt.core.compiler.source=1.7
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

input/dept

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
10,ACCOUNTING,NEW YORK
2+
20,RESEARCH,DALLAS
3+
30,SALES,CHICAGO
4+
40,OPERATIONS,BOSTON

input/emp

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
7369,SMITH,CLERK,7902,17-12月-80,800,,20
2+
7499,ALLEN,SALESMAN,7698,20-2月 -81,1600,300,30
3+
7521,WARD,SALESMAN,7698,22-2月 -81,1250,500,30
4+
7566,JONES,MANAGER,7839,02-4月 -81,2975,,20
5+
7654,MARTIN,SALESMAN,7698,28-9月 -81,1250,1400,30
6+
7698,BLAKE,MANAGER,7839,01-5月 -81,2850,,30
7+
7782,CLARK,MANAGER,7839,09-6月 -81,2450,,10
8+
7839,KING,PRESIDENT,,17-11月-81,5000,,10
9+
7844,TURNER,SALESMAN,7698,08-9月 -81,1500,0,30
10+
7900,JAMES,CLERK,7698,03-12月-81,950,,30
11+
7902,FORD,ANALYST,7566,03-12月-81,3000,,20
12+
7934,MILLER,CLERK,7782,23-1月 -82,1300,,10

output/Q2/._SUCCESS.crc

8 Bytes
Binary file not shown.

output/Q2/.part-r-00000.crc

8 Bytes
Binary file not shown.

output/Q2/_SUCCESS

Whitespace-only changes.

output/Q2/part-r-00000

Whitespace-only changes.

src/README.md

+54
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# MapReduce应用案例
2+
3+
## 环境说明
4+
Hadoop搭建环境:
5+
6+
| 虚拟机操作系统: CentOS6.3 64位,单核,1G内存
7+
| JDK:1.7.0_60 64位
8+
| Hadoop:2.4.1
9+
10+
MR程序编译环境:
11+
12+
| Eclipse IDE
13+
| mapred.LocalJobRunner本地运行模式
14+
15+
## 准备测试数据
16+
17+
测试数据包括两个文件dept(部门)和emp(员工),其中各字段用逗号分隔:
18+
19+
20+
dept文件内容:
21+
22+
10,ACCOUNTING,NEW YORK
23+
20,RESEARCH,DALLAS
24+
30,SALES,CHICAGO
25+
40,OPERATIONS,BOSTON
26+
emp文件内容:
27+
28+
7369,SMITH,CLERK,7902,17-12月-80,800,,20
29+
7499,ALLEN,SALESMAN,7698,20-2月 -81,1600,300,30
30+
7521,WARD,SALESMAN,7698,22-2月 -81,1250,500,30
31+
7566,JONES,MANAGER,7839,02-4月 -81,2975,,20
32+
7654,MARTIN,SALESMAN,7698,28-9月 -81,1250,1400,30
33+
7698,BLAKE,MANAGER,7839,01-5月 -81,2850,,30
34+
7782,CLARK,MANAGER,7839,09-6月 -81,2450,,10
35+
7839,KING,PRESIDENT,,17-11月-81,5000,,10
36+
7844,TURNER,SALESMAN,7698,08-9月 -81,1500,0,30
37+
7900,JAMES,CLERK,7698,03-12月-81,950,,30
38+
7902,FORD,ANALYST,7566,03-12月-81,3000,,20
39+
7934,MILLER,CLERK,7782,23-1月 -82,1300,,10
40+
41+
## 应用案例
42+
### 例子1:求各个部门的总工资
43+
#### 问题分析
44+
MapReduce中的join分为好几种,比如有最常见的 reduce side join、map side join和semi join 等。reduce join 在shuffle阶段要进行大量的数据传输,会造成大量的网络IO效率低下,而map side join 在处理多个小表关联大表时非常有用 。
45+
Map side join是针对以下场景进行的优化:两个待连接表中,有一个表非常大,而另一个表非常小,以至于小表可以直接存放到内存中。这样我们可以将小表复制多份,让每个map task内存中存在一份(比如存放到hash table中),然后只扫描大表:对于大表中的每一条记录key/value,在hash table中查找是否有相同的key的记录,如果有,则连接后输出即可。为了支持文件的复制,Hadoop提供了一个类DistributedCache,使用该类的方法如下:
46+
47+
(1)用户使用静态方法`DistributedCache.addCacheFile()`指定要复制的文件,它的参数是文件的URI(如果是HDFS上的文件,可以这样:`hdfs://jobtracker:50030/home/XXX/file`)。JobTracker在作业启动之前会获取这个URI列表,并将相应的文件拷贝到各个TaskTracker的本地磁盘上。
48+
49+
(2)用户使用:在分布式环境`DistributedCache.getLocalCacheFiles()`/在伪分布式环境`DistributedCache.getCacheFiles()`方法获取文件目录,并使用标准的文件读写API读取相应的文件。
50+
在下面代码中,将会把数据量小的表(部门dept)缓存在内存中,在Mapper阶段对员工部门编号映射成部门名称,该名称作为key输出到Reduce中,在Reduce中计算按照部门计算各个部门的总工资。
51+
52+
#### 处理流程图
53+
![求各个部门的总工资处理流程图](https://i.imgur.com/XpWCrvb.jpg)
54+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
package com.elon33.mr1;
2+
3+
import java.io.IOException;
4+
import java.util.ArrayList;
5+
import java.util.HashMap;
6+
import java.util.List;
7+
import java.util.Map;
8+
9+
import org.apache.hadoop.conf.Configuration;
10+
import org.apache.hadoop.conf.Configured;
11+
import org.apache.hadoop.fs.Path;
12+
import org.apache.hadoop.io.IntWritable;
13+
import org.apache.hadoop.io.LongWritable;
14+
import org.apache.hadoop.io.NullWritable;
15+
import org.apache.hadoop.io.Text;
16+
import org.apache.hadoop.mapreduce.Job;
17+
import org.apache.hadoop.mapreduce.Mapper;
18+
import org.apache.hadoop.mapreduce.Reducer;
19+
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
20+
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
21+
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
22+
import org.apache.hadoop.util.GenericOptionsParser;
23+
import org.apache.hadoop.util.Tool;
24+
import org.apache.hadoop.util.ToolRunner;
25+
26+
public class Q10MiddlePersonsCountForComm extends Configured implements Tool {
27+
28+
public static class MapClass extends Mapper<LongWritable, Text, IntWritable, Text> {
29+
30+
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
31+
32+
String[] kv = value.toString().split(",");
33+
34+
context.write(new IntWritable(0), new Text(kv[0] + "," + ("".equals(kv[3]) ? " " : kv[3])));
35+
}
36+
}
37+
38+
public static class Reduce extends Reducer<IntWritable, Text, NullWritable, Text> {
39+
40+
List<String> employeeList = new ArrayList<String>();
41+
Map<String, String> employeeToManagerMap = new HashMap<String, String>();
42+
43+
public void reduce(IntWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
44+
45+
for (Text value : values) {
46+
employeeList.add(value.toString().split(",")[0].trim());
47+
employeeToManagerMap.put(value.toString().split(",")[0].trim(), value.toString().split(",")[1].trim());
48+
}
49+
}
50+
51+
@Override
52+
protected void cleanup(Context context) throws IOException, InterruptedException {
53+
int totalEmployee = employeeList.size();
54+
int i, j;
55+
int distance;
56+
System.out.println(employeeList);
57+
System.out.println(employeeToManagerMap);
58+
59+
for (i = 0; i < (totalEmployee - 1); i++) {
60+
for (j = (i + 1); j < totalEmployee; j++) {
61+
distance = calculateDistance(i, j);
62+
String value = employeeList.get(i) + " and " + employeeList.get(j) + " = " + distance;
63+
context.write(NullWritable.get(), new Text(value));
64+
}
65+
}
66+
}
67+
68+
private int calculateDistance(int i, int j) {
69+
String employeeA = employeeList.get(i);
70+
String employeeB = employeeList.get(j);
71+
int distance = 0;
72+
73+
if (employeeToManagerMap.get(employeeA).equals(employeeB) || employeeToManagerMap.get(employeeB).equals(employeeA)) {
74+
distance = 0;
75+
}
76+
else if (employeeToManagerMap.get(employeeA).equals(employeeToManagerMap.get(employeeB))) {
77+
distance = 0;
78+
} else {
79+
List<String> employeeA_ManagerList = new ArrayList<String>();
80+
List<String> employeeB_ManagerList = new ArrayList<String>();
81+
82+
employeeA_ManagerList.add(employeeA);
83+
String current = employeeA;
84+
while (false == employeeToManagerMap.get(current).isEmpty()) {
85+
current = employeeToManagerMap.get(current);
86+
employeeA_ManagerList.add(current);
87+
}
88+
89+
employeeB_ManagerList.add(employeeB);
90+
current = employeeB;
91+
while (false == employeeToManagerMap.get(current).isEmpty()) {
92+
current = employeeToManagerMap.get(current);
93+
employeeB_ManagerList.add(current);
94+
}
95+
96+
int ii = 0, jj = 0;
97+
String currentA_manager, currentB_manager;
98+
boolean found = false;
99+
100+
for (ii = 0; ii < employeeA_ManagerList.size(); ii++) {
101+
currentA_manager = employeeA_ManagerList.get(ii);
102+
for (jj = 0; jj < employeeB_ManagerList.size(); jj++) {
103+
currentB_manager = employeeB_ManagerList.get(jj);
104+
if (currentA_manager.equals(currentB_manager)) {
105+
found = true;
106+
break;
107+
}
108+
}
109+
110+
if (found) {
111+
break;
112+
}
113+
}
114+
115+
distance = ii + jj - 1;
116+
}
117+
118+
return distance;
119+
}
120+
}
121+
122+
@Override
123+
public int run(String[] args) throws Exception {
124+
125+
Job job = new Job(getConf(), "Q10MiddlePersonsCountForComm");
126+
job.setJobName("Q10MiddlePersonsCountForComm");
127+
128+
job.setJarByClass(Q10MiddlePersonsCountForComm.class);
129+
job.setMapperClass(MapClass.class);
130+
job.setReducerClass(Reduce.class);
131+
132+
job.setMapOutputKeyClass(IntWritable.class);
133+
job.setMapOutputValueClass(Text.class);
134+
135+
job.setOutputFormatClass(TextOutputFormat.class);
136+
job.setOutputKeyClass(NullWritable.class);
137+
job.setOutputValueClass(Text.class);
138+
139+
String[] otherArgs = new GenericOptionsParser(job.getConfiguration(), args).getRemainingArgs();
140+
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
141+
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
142+
143+
job.waitForCompletion(true);
144+
return job.isSuccessful() ? 0 : 1;
145+
}
146+
147+
public static void main(String[] args) throws Exception {
148+
int res = ToolRunner.run(new Configuration(), new Q10MiddlePersonsCountForComm(), args);
149+
System.exit(res);
150+
}
151+
}

0 commit comments

Comments
 (0)