Commit 059fd7469c31485853bbab4a227c637760654cc7

Solved all bugs; updated the result screenshots
100queryEmailResult.png
(63 / 0)
Binary files differ
Screen Shot 2015-12-03 at 9.58.47 PM.png
(0 / 68)
Binary files differ
if_query_email_from_data1-5.png
(360 / 0)
Binary files differ
query_email_from_data1-5.png
(0 / 870)
Binary files differ
week2/src/main/java/BigData/week2/Main.java
(230 / 184)
  
11package BigData.week2;
22
3import java.io.File;
43import java.io.IOException;
54import java.util.ArrayList;
6import java.util.Arrays;
7import java.util.Collections;
85import java.util.HashMap;
96import java.util.List;
107import java.util.Map;
118import java.util.Map.Entry;
129
13import org.jfree.data.xy.XYSeries;
14import org.jfree.ui.RefineryUtilities;
10public class Main {
1511
16import com.google.common.collect.ArrayListMultimap;
17import com.google.common.collect.Multimap;
12 public static void main(String[] args) throws IOException {
1813
19import BigData.week2.XYSeriesDemo;
20import BigData.week2.errorCalculation;
14 // part one starts
2115
22public class Main {
16 // // read training spam email data
17 // String folderPathOfSpamTraning =
18 // "/Users/yang/Downloads/study/BigData/week3_test/spamtest2/spam/";
19 // // read training ham email data
20 // String folderPathOfHamTraning =
21 // "/Users/yang/Downloads/study/BigData/week3_test/hamtest2/ham/";
22 // // read query email
23 // String folderPathofQuery =
24 // "/Users/yang/Downloads/study/BigData/week3_test/query2/query/";
25 // //part one starts
26 // double
27 // disQueryAndSpam=calculateDistance.calculateDis(folderPathOfSpamTraning,folderPathofQuery);
28 // double
29 // disQueryAndHam=calculateDistance.calculateDis(folderPathOfHamTraning,folderPathofQuery);
30 //
31 // if(disQueryAndHam<disQueryAndSpam){
32 // System.out.println("part 1:This query email is most likely to be a ham");
33 // }else{
34 // System.out.println("part 1:This query email is most likely to be a spam");
35 // }
36 // System.out.println("-1--disQueryAndSpam ----"+disQueryAndSpam);
37 //
38 // System.out.println("-1--disQueryAndHam ----"+disQueryAndHam );
2339
24 public static void main(String[] args) throws IOException {
25
26 //part one starts
27
28// // read training spam email data
29// String folderPathOfSpamTraning =
30// "/Users/yang/Downloads/study/BigData/week3_test/spamtest2/spam/";
31// // read training ham email data
32// String folderPathOfHamTraning =
33// "/Users/yang/Downloads/study/BigData/week3_test/hamtest2/ham/";
34// // read query email
35// String folderPathofQuery =
36// "/Users/yang/Downloads/study/BigData/week3_test/query2/query/";
37// //part one starts
38// double
39// disQueryAndSpam=calculateDistance.calculateDis(folderPathOfSpamTraning,folderPathofQuery);
40// double
41// disQueryAndHam=calculateDistance.calculateDis(folderPathOfHamTraning,folderPathofQuery);
42//
43// if(disQueryAndHam<disQueryAndSpam){
44// System.out.println("part 1:This query email is most likely to be a ham");
45// }else{
46// System.out.println("part 1:This query email is most likely to be a spam");
47// }
48// System.out.println("-1--disQueryAndSpam ----"+disQueryAndSpam);
49//
50// System.out.println("-1--disQueryAndHam ----"+disQueryAndHam );
51
52 // part one ends
53
54
55
40 // part one ends
5641
5742 // part three starts
58// // read training spam email data
59// String folderPathOfSpamTraning3 = "/Users/yang/Downloads/study/BigData/week3_data/spamtest2/";
60// // read training ham email data
61// String folderPathOfHamTraning3 = "/Users/yang/Downloads/study/BigData/week3_data/hamtest2/";
62// // read query email
63// String folderPathofQuery3 = "/Users/yang/Downloads/study/BigData/week3_data/merge100test2/";
64//
65// Map<String,Double> QueryAndRealDistance = realDistance.getRealDistance(
66// folderPathOfSpamTraning3, folderPathOfHamTraning3,
67// folderPathofQuery3);
68// Map<Integer, Float> erroTable = new HashMap<Integer, Float>();
69//
70// for (int h = 0; h < 7; h++) {
71// int hyperpalinAmount3 = (int) Math.pow(2, h);// change seeds here
72// // [1, 2, 4, 8, 16,
73// // 32,64]
74// List<Integer> seedList3 = new ArrayList<Integer>();// 10 seeds for
75// // hyerpain
76// for (int i = 0; i < hyperpalinAmount3; i++) {
77// int randomNum = 62645 + (int) (Math.random() * 423564178);
78// if (!seedList3.contains(randomNum)) {
79// seedList3.add(randomNum);
80// }
81// }
82//
83//
84// Map<String,List<String>> QueryAndClosestDistance = queryWithCloesestEmailSet.hyperplainMaker(
85// folderPathOfSpamTraning3, folderPathOfHamTraning3,
86// folderPathofQuery3, seedList3);
87//
88// Map<String,Double> realDisWithClosestEmailByHyper=new HashMap<String,Double>();
89// for (Entry<String, List<String>> ecd : QueryAndClosestDistance.entrySet())
90// {
91//
92//
93//
94// double haha=0;
95// List<Double> minDis =new ArrayList<Double>();
96// for(int i=0;i<ecd.getValue().size();i++)
97// {
98// haha=realDistanceBetweenTwoemail.getDistance(ecd.getKey(), ecd.getValue().get(i)).get(ecd.getKey());
99// System.out.println("haha:" + haha);
100// minDis.add(haha);
101//
102// }
103// double min=0;
104// if(ecd.getValue().size()==0){ min=0;}else{min=Collections.min(minDis);}
105//
106// realDisWithClosestEmailByHyper.put(ecd.getKey(), min);
107//
108// }
109// for (Entry<String, Double> rdceb : realDisWithClosestEmailByHyper.entrySet())
110// {
111// float erroN = (float) Math.abs(rdceb.getValue()-QueryAndRealDistance.get(rdceb.getKey()));
112// erroTable.put(hyperpalinAmount3, erroN);
113// }
114//
115//
116// }
117// for (java.util.Map.Entry<Integer, Float> e : erroTable.entrySet()) {
118// // to get key
119// e.getKey();
120// // and to get value
121// e.getValue();
122//
123// System.out.println(e.getKey() + "," + e.getValue());
124// }
125//
126// // draw a chart
127// final XYSeries series = new XYSeries("error in n hyperpalins");
128//
129// for (int i = 0; i < erroTable.size(); i++) {
130//
131// series.add((int) Math.pow(2, i),
132// erroTable.get((int) Math.pow(2, i)));
133//
134// final XYSeriesDemo demo = new XYSeriesDemo("error in n hyperpalins",
135// series);
136// demo.pack();
137// RefineryUtilities.centerFrameOnScreen(demo);
138// demo.setVisible(true);
139
43 // // read training spam email data
44 // String folderPathOfSpamTraning3 =
45 // "/Users/yang/Downloads/study/BigData/week3_data/spamtest2/";
46 // // read training ham email data
47 // String folderPathOfHamTraning3 =
48 // "/Users/yang/Downloads/study/BigData/week3_data/hamtest2/";
49 // // read query email
50 // String folderPathofQuery3 =
51 // "/Users/yang/Downloads/study/BigData/week3_data/merge100test2/";
52 //
53 // Map<String,Double> QueryAndRealDistance =
54 // realDistance.getRealDistance(
55 // folderPathOfSpamTraning3, folderPathOfHamTraning3,
56 // folderPathofQuery3);
57 // Map<Integer, Float> erroTable = new HashMap<Integer, Float>();
58 //
59 // for (int h = 0; h < 7; h++) {
60 // int hyperpalinAmount3 = (int) Math.pow(2, h);// change seeds here
61 // // [1, 2, 4, 8, 16,
62 // // 32,64]
63 // List<Integer> seedList3 = new ArrayList<Integer>();// 10 seeds for
64 // // hyerpain
65 // for (int i = 0; i < hyperpalinAmount3; i++) {
66 // int randomNum = 62645 + (int) (Math.random() * 423564178);
67 // if (!seedList3.contains(randomNum)) {
68 // seedList3.add(randomNum);
69 // }
70 // }
71 //
72 //
73 // Map<String,List<String>> QueryAndClosestDistance =
74 // queryWithCloesestEmailSet.hyperplainMaker(
75 // folderPathOfSpamTraning3, folderPathOfHamTraning3,
76 // folderPathofQuery3, seedList3);
77 //
78 // Map<String,Double> realDisWithClosestEmailByHyper=new
79 // HashMap<String,Double>();
80 // for (Entry<String, List<String>> ecd :
81 // QueryAndClosestDistance.entrySet())
82 // {
83 //
84 //
85 //
86 // double haha=0;
87 // List<Double> minDis =new ArrayList<Double>();
88 // for(int i=0;i<ecd.getValue().size();i++)
89 // {
90 // haha=realDistanceBetweenTwoemail.getDistance(ecd.getKey(),
91 // ecd.getValue().get(i)).get(ecd.getKey());
92 // System.out.println("haha:" + haha);
93 // minDis.add(haha);
94 //
95 // }
96 // double min=0;
97 // if(ecd.getValue().size()==0){
98 // min=0;}else{min=Collections.min(minDis);}
99 //
100 // realDisWithClosestEmailByHyper.put(ecd.getKey(), min);
101 //
102 // }
103 // for (Entry<String, Double> rdceb :
104 // realDisWithClosestEmailByHyper.entrySet())
105 // {
106 // float erroN = (float)
107 // Math.abs(rdceb.getValue()-QueryAndRealDistance.get(rdceb.getKey()));
108 // erroTable.put(hyperpalinAmount3, erroN);
109 // }
110 //
111 //
112 // }
113 // for (java.util.Map.Entry<Integer, Float> e : erroTable.entrySet()) {
114 // // to get key
115 // e.getKey();
116 // // and to get value
117 // e.getValue();
118 //
119 // System.out.println(e.getKey() + "," + e.getValue());
120 // }
121 //
122 // // draw a chart
123 // final XYSeries series = new XYSeries("error in n hyperpalins");
124 //
125 // for (int i = 0; i < erroTable.size(); i++) {
126 //
127 // series.add((int) Math.pow(2, i),
128 // erroTable.get((int) Math.pow(2, i)));
129 //
130 // final XYSeriesDemo demo = new XYSeriesDemo("error in n hyperpalins",
131 // series);
132 // demo.pack();
133 // RefineryUtilities.centerFrameOnScreen(demo);
134 // demo.setVisible(true);
135
140136 // part three ends
141
137
142138 // part four starts
143139 // read training spam email data
144 String folderPathOfSpamTraning3 = "/Users/yang/Downloads/study/BigData/week3_data/spam/";
140 String folderPathOfSpamTraning3 = "/Users/yang/Downloads/study/BigData/week3_data/spamtest2/";
145141 // read training ham email data
146 String folderPathOfHamTraning3 = "/Users/yang/Downloads/study/BigData/week3_data/ham/";
142 String folderPathOfHamTraning3 = "/Users/yang/Downloads/study/BigData/week3_data/hamtest2/";
147143 // read query email
148144 String folderPathofQuery3 = "/Users/yang/Downloads/study/BigData/week3_data/merge100test3/";
149 int hamMarkedAsHam=0;
150 int hamMarkedAsSpam=0;
151 int spamMarkedAsSpam=0;
152 int spamMarkedAsHam=0;
153 Map<String,Map<String,Double>> judageResult=new HashMap<String,Map<String,Double>>() ;
154 Map<String,List<String>> merge2=new HashMap<String,List<String>>();
155 Map<String,List<String>> merge=new HashMap<String,List<String>>();
156
157 int j=0;
158 for (int h = 0; h < 4; h++) {
159 //System.out.println("----h---"+h);
160 int hyperpalinAmount3 = (int) Math.pow(2, 4);// change seed's number here
161
145 int hamMarkedAsHam = 0;
146 int hamMarkedAsSpam = 0;
147 int spamMarkedAsSpam = 0;
148 int spamMarkedAsHam = 0;
149 Map<String, Map<String, Double>> judageResult = new HashMap<String, Map<String, Double>>();
150 Map<String, List<String>> merge2 = new HashMap<String, List<String>>();
151 Map<String, List<String>> merge = new HashMap<String, List<String>>();
152 List<String> QueryEmailsCannotFindSameSignature= new ArrayList<String>();
153
154 int j = 0;
155 for (int h = 0; h < 1; h++) {
156 // System.out.println("----h---"+h);
157 int hyperpalinAmount3 = (int) Math.pow(2, 4);// change seed's number
158 // here
159
162160 List<Integer> seedList3 = new ArrayList<Integer>();
163
161
164162 for (int i = 0; i < hyperpalinAmount3; i++) {
165163 int randomNum = 62645 + (int) (Math.random() * 423564178);
166164 if (!seedList3.contains(randomNum)) {
167165 seedList3.add(randomNum);
168166 }
169167 }
168
169 // calculate
170
171 Map<String, List<String>> QueryAndClosestDistance = queryWithCloesestEmailSet
172 .hyperplainMaker(folderPathOfSpamTraning3,
173 folderPathOfHamTraning3, folderPathofQuery3,
174 seedList3);
170175
171 //calculate
172
173 Map<String,List<String>> QueryAndClosestDistance = queryWithCloesestEmailSet.hyperplainMaker(
174 folderPathOfSpamTraning3, folderPathOfHamTraning3,
175 folderPathofQuery3, seedList3);
176 for (Entry<String, List<String>> ecd : QueryAndClosestDistance.entrySet())
177 {
178 List<String> mideleList=new ArrayList<String>();
179 if(j==0){merge2.put(ecd.getKey(),mideleList);}
180 merge2.get(ecd.getKey()).addAll(ecd.getValue());
176 System.out.println("QueryAndClosestDistance.size()"+QueryAndClosestDistance.size());
177 for (Entry<String, List<String>> ecd : QueryAndClosestDistance
178 .entrySet()) {
179 List<String> mideleList = new ArrayList<String>();
180 if (j == 0) {
181 merge2.put(ecd.getKey(), mideleList);
182 }
183 merge2.get(ecd.getKey()).addAll(ecd.getValue());
181184 }
182 j++;
185 j++;
183186 }
184 for(Entry<String, List<String>> ecd : merge2.entrySet()) {
185
186 if(ecd.getValue().size()!=0){
187
188 System.out.println("merge2.size()"+merge2.size());
189 for (Entry<String, List<String>> ecd : merge2.entrySet()) {
190
191 if (ecd.getValue().size() != 0) {
187192 merge.put(ecd.getKey(), ecd.getValue());
193 }else{QueryEmailsCannotFindSameSignature.add(ecd.getKey());}
194 }
195 System.out.println("QueryEmailsCannotFindSameSignature"+QueryEmailsCannotFindSameSignature.size());
196 System.out.println("merge.size()"+merge.size());
197
198
199
200 for (Entry<String, List<String>> m : merge.entrySet()) {
201 Map<String, Double> minDis = new HashMap<String, Double>();
202 for (int i = 0; i < m.getValue().size(); i++) {
203 double distanceBetweenTwo = realDistanceBetweenTwoemail
204 .getDistanceWithEmailName(m.getKey(),
205 m.getValue().get(i)).get(m.getValue().get(i));
206 minDis.put(m.getValue().get(i), distanceBetweenTwo);
188207 }
208 if (m.getValue().size() == 0) {
209 minDis.put("ham", Math.PI / 2);
210 //System.out.println("1");
211 }
212 //System.out.println("minDis"+minDis);
213 judageResult.put(m.getKey(), minDis);
214 //System.out.println("--judageResult"+judageResult);
215
189216 }
190 Map<String,Double> minDis =new HashMap<String,Double>();
191 for (Entry<String, List<String>> m : merge.entrySet())
192 {
193 double haha=0;
194 for(int i=0;i<m.getValue().size();i++)
195 {
196 haha=realDistanceBetweenTwoemail.getDistanceWithEmailName(m.getKey(), m.getValue().get(i)).get(m.getValue().get(i));
197 minDis.put(m.getValue().get(i),haha);
217 //System.out.println("judageResult.size()"+judageResult.size());
218 for (Entry<String, Map<String, Double>> jr : judageResult.entrySet()) {
219
220 String keyWithLowerestvalue = null;
221 // System.out.println("jr.getValue()" + jr.getValue());
222 // keyWithLowerestvalue = Collections
223 // .min(jr.getValue().entrySet(),
224 // (entry1, entry2) -> entry1.getValue() < entry2
225 // .getValue() ? -1 : 1).getKey();
226 // double minDS= Collections.min(jr.getValue().values());
227 // for (Entry<String, Double> jrv : jr.getValue().entrySet()) {
228 // keyWithLowerestvalue=jr.getValue();
229 //
230 // }
231
232 double lowestDistance = Double.MAX_VALUE;
233 for (Entry<String, Double> b : jr.getValue().entrySet()) {
234 Double distance = b.getValue();
235 if (distance < lowestDistance) {
236 lowestDistance = distance;
237 keyWithLowerestvalue = b.getKey();
198238 }
199 if(m.getValue().size()==0){minDis.put("ham.txt", Math.PI/2);}
200 judageResult.put(m.getKey(),minDis);
239 }
240
241 // System.out.println("query" + jr.getKey());
242 //System.out.println("non-query" + keyWithLowerestvalue);
243 if (jr.getKey().contains("ham")
244 && keyWithLowerestvalue.contains("ham")) {
245 hamMarkedAsHam++;
246 } else if (jr.getKey().contains("ham")
247 && keyWithLowerestvalue.contains("spam")) {
248 hamMarkedAsSpam++;
249 } else if (jr.getKey().contains("spam")
250 && keyWithLowerestvalue.contains("spam")) {
251 spamMarkedAsSpam++;
252 } else if (jr.getKey().contains("spam")
253 && keyWithLowerestvalue.contains("ham")) {
254 spamMarkedAsHam++;
255 }
201256 }
202 for(Entry<String, Map<String, Double>> jr : judageResult.entrySet())
203 {
204
205 String keyWithLowerestvalue=null;
206 keyWithLowerestvalue=Collections.min(jr.getValue().entrySet(), (entry1, entry2) -> entry1.getValue() < entry2.getValue() ? -1: 1).getKey();
207 if(jr.getKey().contains("ham.txt")&&keyWithLowerestvalue.contains("ham.txt")){hamMarkedAsHam++;}
208 else if(jr.getKey().contains("ham.txt")&&keyWithLowerestvalue.contains("spam.txt")){hamMarkedAsSpam++;}
209 else if(jr.getKey().contains("spam.txt")&&keyWithLowerestvalue.contains("spam.txt")){spamMarkedAsSpam++;}
210 else if(jr.getKey().contains("spam.txt")&&keyWithLowerestvalue.contains("ham.txt")){spamMarkedAsHam++;}
211 }
212
213 System.out.println("hamMarkedAsHam"+hamMarkedAsHam);
214 System.out.println("hamMarkedAsSpam"+hamMarkedAsSpam);
215 System.out.println("spamMarkedAsSpam"+spamMarkedAsSpam);
216 System.out.println("spamMarkedAsHam"+spamMarkedAsHam);
217
218 // part four ends
219
257
258 System.out.println("hamMarkedAsHam" + hamMarkedAsHam);
259 System.out.println("hamMarkedAsSpam" + hamMarkedAsSpam);
260 System.out.println("spamMarkedAsSpam" + spamMarkedAsSpam);
261 System.out.println("spamMarkedAsHam" + spamMarkedAsHam);
262 System.out.println("someOtherEmailCannotFindAnyEmailWithSameSignature" + QueryEmailsCannotFindSameSignature.size());
263
264 // part four ends
265
220266 System.out.println("---main done ----");
221267
222268 }
223269
224 }
270}
week2/src/main/java/BigData/week2/emailConvert.java
(1 / 1)
  
1313
1414 final Map<String, Integer> oneVector = new HashMap<String, Integer>();
1515 String cvsSplitBy3 = " ";
16 email = email.substring(8); //get rid of the 'Subject:' part
16 //email = email.substring(8); //get rid of the 'Subject:' part
1717 String[] emailPart = email.split(cvsSplitBy3);
1818 //int emailSize=emailPart.length;
1919 for(int s=0;s<emailPart.length;s++){
week2/src/main/java/BigData/week2/hyperplainVector.java
(32 / 23)
  
44import java.util.HashMap;
55import java.util.List;
66import java.util.Map;
7import java.util.Vector;
87import java.util.Map.Entry;
8import java.util.Vector;
99
1010import com.google.common.hash.HashCode;
1111import com.google.common.hash.HashFunction;
1212import com.google.common.hash.Hashing;
1313
1414public class hyperplainVector {
15
16 public static Map<String, Vector<Integer>> gethyperVectors(Map<String,Map<String, Integer>> vectorsOfHam,List<Integer> seedList){
17
18 Map<String,Vector<Integer>> allHamVectorInHyperplain = new HashMap<String,Vector<Integer>>();
19
20
21 long sumOfHashham = 0;
22
15
16 public static Map<String, Vector<Integer>> gethyperVectors(
17 Map<String, Map<String, Integer>> vectorsOfHam,
18 List<Integer> seedList) {
19
20 Map<String, Vector<Integer>> allHamVectorInHyperplain = new HashMap<String, Vector<Integer>>();
21 // Map<String,Vector<Integer>> allSpamVectorInHyperplain = new
22 // HashMap<String,Vector<Integer>>();
23 // Map<String,Vector<Integer>> allQueryVectorInHyperplain = new
24 // HashMap<String,Vector<Integer>>();
25 // List<Map<String,Vector<Integer>>> allListVectorInHyperplain = new
26 // ArrayList<Map<String,Vector<Integer>>>();
27
28 // long sumOfHashspam = 0;
29 // long sumOfHashqueryam = 0;
30 //System.out.println("seedList " + seedList);
2331 for (Entry<String, Map<String, Integer>> e : vectorsOfHam.entrySet()) {
24
25 Vector<Integer> oneVectorInHyperplain = new Vector<Integer>(seedList.size());
26
27
32
33 Vector<Integer> oneVectorInHyperplain = new Vector<Integer>(
34 seedList.size());
35
2836 for (int seedi = 0; seedi < seedList.size(); seedi++) {
2937
3038 HashFunction h = Hashing.murmur3_128(seedList.get(seedi));
31
39 double sumOfHashham = 0;
40
3241 // hyperplain for training emails
33 for (Entry<String, Integer> entry : e.getValue()
34 .entrySet()) {
42 for (Entry<String, Integer> entry : e.getValue().entrySet()) {
3543 String k = entry.getKey();
3644 int v = entry.getValue();
3745
3846 HashCode hvofEmali = h
3947 .hashString(k, StandardCharsets.UTF_8);
4048
41 sumOfHashham += hvofEmali.asLong() * v;
42
49 sumOfHashham += ((double)hvofEmali.asLong()) * v;
4350
4451 }
4552
46
4753 if (sumOfHashham < 0) {
4854 oneVectorInHyperplain.add(0);
49
55
5056 } else if (sumOfHashham > 0) {
5157 oneVectorInHyperplain.add(1);
52
58
5359 } else {
5460 oneVectorInHyperplain.add(0);
5561 }
5662
5763 }
58 allHamVectorInHyperplain.put(e.getKey(),oneVectorInHyperplain);
59 }
64 allHamVectorInHyperplain.put(e.getKey(), oneVectorInHyperplain);
65 }
66
67// System.out.println(" allVectorInHyperplain"
68// + allHamVectorInHyperplain);
6069 return allHamVectorInHyperplain;
6170 }
6271
week2/src/main/java/BigData/week2/queryWithCloesestEmailSet.java
(61 / 63)
  
11package BigData.week2;
22
3import java.io.File;
43import java.io.IOException;
5import java.nio.charset.StandardCharsets;
64import java.util.ArrayList;
7import java.util.Collections;
85import java.util.HashMap;
96import java.util.List;
107import java.util.Map;
118import java.util.Map.Entry;
129import java.util.Vector;
1310
14import com.google.common.hash.HashCode;
15import com.google.common.hash.HashFunction;
16import com.google.common.hash.Hashing;
17
1811public class queryWithCloesestEmailSet {
1912
20 static Map<String,List<String>> hyperplainMaker(String folderPathOfSpamTraning,
21 String folderPathOfHamTraning, String folderPathofQuery,
22 List<Integer> seedList) throws IOException {
13 static Map<String, List<String>> hyperplainMaker(
14 String folderPathOfSpamTraning, String folderPathOfHamTraning,
15 String folderPathofQuery, List<Integer> seedList)
16 throws IOException {
2317 // TODO Auto-generated method stub
2418
25 Map<String,List<String>> QueryAndClosestEmailWithName = new HashMap<String,List<String>>();
26
27
28 Map<String,Map<String, Integer>> vectorsOfHam = readTrainningData
19 Map<String, List<String>> QueryAndClosestEmailWithName = new HashMap<String, List<String>>();
20
21 Map<String, Map<String, Integer>> vectorsOfHam = readTrainningData
2922 .readData3(folderPathOfHamTraning);
30 Map<String,Map<String, Integer>> vectorsOfSpam = readTrainningData
23 Map<String, Map<String, Integer>> vectorsOfSpam = readTrainningData
3124 .readData3(folderPathOfSpamTraning);
3225
33 Map<String,Map<String, Integer>> vectorsOfQuery = readTrainningData
26 Map<String, Map<String, Integer>> vectorsOfQuery = readTrainningData
3427 .readData3(folderPathofQuery);
3528
3629 // System.out.println("---------------------------------- query email of qure----------------------");
3730
38 Map<String,Vector<Integer>> allHamVectorInHyperplain = new HashMap<String,Vector<Integer>>();
39 Map<String,Vector<Integer>> allSpamVectorInHyperplain = new HashMap<String,Vector<Integer>>();
40 Map<String,Vector<Integer>> queryVectorInHyperplain = new HashMap<String,Vector<Integer>>();
31 Map<String, Vector<Integer>> allHamVectorInHyperplain = new HashMap<String, Vector<Integer>>();
32 Map<String, Vector<Integer>> allSpamVectorInHyperplain = new HashMap<String, Vector<Integer>>();
33 Map<String, Vector<Integer>> queryVectorInHyperplain = new HashMap<String, Vector<Integer>>();
4134
42 //read Ham
43
44
45 allHamVectorInHyperplain=hyperplainVector.gethyperVectors(vectorsOfHam, seedList);
46
47
48
49
50 //read spam
51 allSpamVectorInHyperplain=hyperplainVector.gethyperVectors(vectorsOfSpam, seedList);
52 //read query
53 queryVectorInHyperplain =hyperplainVector.gethyperVectors(vectorsOfQuery, seedList);
54
55 //compare query with email in hyperplain
56 for (Entry<String, Vector<Integer>> eq : queryVectorInHyperplain.entrySet())
57 {
58 List<String> oneEmailSetClosedToQuery=new ArrayList<String>();
59
60 for (Entry<String, Vector<Integer>> eh : allHamVectorInHyperplain.entrySet())
61 {
62 if(eq.getValue().equals(eh.getValue())){
35 // read query
36 queryVectorInHyperplain = hyperplainVector.gethyperVectors(
37 vectorsOfQuery, seedList);
38
39
40 // read spam
41 allSpamVectorInHyperplain = hyperplainVector.gethyperVectors(
42 vectorsOfSpam, seedList);
43 // read Ham
44 allHamVectorInHyperplain = hyperplainVector.gethyperVectors(
45 vectorsOfHam, seedList);
46
47 // System.out.println("allHamVectorInHyperplain--"+allHamVectorInHyperplain);
48 // System.out.println("allSpamVectorInHyperplain--"+allSpamVectorInHyperplain);
49 // System.out.println("queryVectorInHyperplain--"+queryVectorInHyperplain);
50
51 // compare query with email in hyperplain
52 for (Entry<String, Vector<Integer>> eq : queryVectorInHyperplain
53 .entrySet()) {
54 List<String> oneEmailSetClosedToQuery = new ArrayList<String>();
55
56 for (Entry<String, Vector<Integer>> eh : allHamVectorInHyperplain
57 .entrySet()) {
58// System.out.println("eq.getValue()" + eq.getValue());
59// System.out.println("eh.getValue()" + eh.getValue());
60// System.out.println("caonima eq" + eq.getKey());
61// System.out.println("caonima eh" + eh.getKey());
62 if (eq.getValue().equals(eh.getValue())) {
6363 oneEmailSetClosedToQuery.add(eh.getKey());
64 //System.out.println("caonima eh--"+eh.getKey());
64 // System.out.println("caonima eq--"+eq.getKey());
65 // System.out.println("caonima eh--"+eh.getKey());
6566 }
6667 }
67
68 for (Entry<String, Vector<Integer>> es : allSpamVectorInHyperplain.entrySet())
69 {
70// System.out.println("eq.getValue()"+eq.getValue());
71// System.out.println("es.getValue()"+es.getValue());
72
73 if(eq.getValue().equals(es.getValue())){
68
69 for (Entry<String, Vector<Integer>> es : allSpamVectorInHyperplain
70 .entrySet()) {
71// System.out.println("eq.getValue()--" + eq.getValue());
72// System.out.println("es.getValue()--" + es.getValue());
73// System.out.println("caonima eq--" + eq.getKey());
74// System.out.println("caonima es--" + es.getKey());
75 if (eq.getValue().equals(es.getValue())) {
7476 oneEmailSetClosedToQuery.add(es.getKey());
75 //System.out.println("caonima es--"+es.getKey());
77 // System.out.println("caonima es--"+es.getKey());
78 // System.out.println("caonima eq--"+eq.getKey());
7679 }
7780 }
78 //System.out.println("---jj----"+oneEmailSetClosedToQuery.size());
79 QueryAndClosestEmailWithName.put(eq.getKey(),oneEmailSetClosedToQuery);
81 // System.out.println("---jj----"+oneEmailSetClosedToQuery.size());
82 QueryAndClosestEmailWithName.put(eq.getKey(),
83 oneEmailSetClosedToQuery);
8084 }
81
82
83 //System.out.println("---queryWithCloesestEmailSet----");
84
85
86 return QueryAndClosestEmailWithName;
87
88
89 }
90
91
92
9385
86// System.out.println("---queryWithCloesestEmailSet----"
87// + QueryAndClosestEmailWithName);
88
89 return QueryAndClosestEmailWithName;
90
91 }
9492
9593}
week2/src/main/java/BigData/week2/realDistanceBetweenTwoemail.java
(43 / 38)
  
99import org.apache.commons.io.FileUtils;
1010
1111public class realDistanceBetweenTwoemail {
12
13 public static Map<String,Double> getDistance(String pathToQueryEmail,String pathToEmail2) throws IOException{
12
13 public static Map<String, Double> getDistance(String pathToQueryEmail,
14 String pathToEmail2) throws IOException {
1415 // TODO Auto-generated method stub
15 Map<String, Double> twoEmailDistance = new HashMap<String, Double>();
16
17 File file1 = new File(pathToQueryEmail);
16 Map<String, Double> twoEmailDistance = new HashMap<String, Double>();
17
18 File file1 = new File(pathToQueryEmail);
1819 String email1 = FileUtils.readFileToString(file1);
19 File file2 = new File(pathToEmail2);
20 File file2 = new File(pathToEmail2);
2021 String email2 = FileUtils.readFileToString(file2);
21
22
2223 Map<String, Integer> VectorEmail1 = emailConvert.convert(email1);
2324 Map<String, Integer> VectorEmail2 = emailConvert.convert(email2);
24
25
2526 double sqOfEmail1 = 0;
2627 double sqOfEmail2 = 0;
2728
2829 double sumOfMutiply = 0;
29 for (Entry<String, Integer> entry : VectorEmail1
30 .entrySet()) {
30 for (Entry<String, Integer> entry : VectorEmail1.entrySet()) {
3131 String qk = entry.getKey();
3232 int qv = entry.getValue();
3333 sqOfEmail1 += qv * qv;
3737 }
3838 }
3939
40
41 for (Entry<String, Integer> entry : VectorEmail2
42 .entrySet()) {
40 for (Entry<String, Integer> entry : VectorEmail2.entrySet()) {
4341
4442 int v = entry.getValue();
4543
4644 sqOfEmail2 += v * v;
4745 }
48
49 double middleVarable=sumOfMutiply/ (Math.sqrt(sqOfEmail1) * Math.sqrt(sqOfEmail2));
50 if(middleVarable >1){middleVarable=1;}
46
47 double middleVarable = sumOfMutiply
48 / (Math.sqrt(sqOfEmail1) * Math.sqrt(sqOfEmail2));
49 if (middleVarable > 1) {
50 middleVarable = 1;
51 }
5152 double distance = Math.acos(middleVarable);
5253 twoEmailDistance.put(pathToQueryEmail, distance);
53 //System.out.println("---realDistanceBetweenTwoemail----");
54 return twoEmailDistance;
54 // System.out.println("---realDistanceBetweenTwoemail----");
55 return twoEmailDistance;
5556 }
5657
57 public static Map<String,Double> getDistanceWithEmailName(String pathToQueryEmail,String pathToEmail2) throws IOException{
58 // TODO Auto-generated method stub
59 Map<String, Double> twoEmailDistance = new HashMap<String, Double>();
60
61 File file1 = new File(pathToQueryEmail);
58 public static Map<String, Double> getDistanceWithEmailName(
59 String pathToQueryEmail, String pathToEmail2) throws IOException {
60 Map<String, Double> twoEmailDistance = new HashMap<String, Double>();
61
62 File file1 = new File(pathToQueryEmail);
6263 String email1 = FileUtils.readFileToString(file1);
63 File file2 = new File(pathToEmail2);
64 File file2 = new File(pathToEmail2);
6465 String email2 = FileUtils.readFileToString(file2);
65
66
6667 Map<String, Integer> VectorEmail1 = emailConvert.convert(email1);
6768 Map<String, Integer> VectorEmail2 = emailConvert.convert(email2);
68
69
6970 double sqOfEmail1 = 0;
7071 double sqOfEmail2 = 0;
7172
7273 double sumOfMutiply = 0;
73 for (Entry<String, Integer> entry : VectorEmail1
74 .entrySet()) {
74 for (Entry<String, Integer> entry : VectorEmail1.entrySet()) {
7575 String qk = entry.getKey();
7676 int qv = entry.getValue();
7777 sqOfEmail1 += qv * qv;
8181 }
8282 }
8383
84
85 for (Entry<String, Integer> entry : VectorEmail2
86 .entrySet()) {
84 for (Entry<String, Integer> entry : VectorEmail2.entrySet()) {
8785
8886 int v = entry.getValue();
8987
9088 sqOfEmail2 += v * v;
9189 }
92
93 double middleVarable=sumOfMutiply/ (Math.sqrt(sqOfEmail1) * Math.sqrt(sqOfEmail2));
94 if(middleVarable >1){middleVarable=1;}
95 double distance = Math.acos(middleVarable);
96 twoEmailDistance.put(pathToEmail2, distance);
97 //System.out.println(distance+"pathToEmail2"+pathToEmail2);
98 return twoEmailDistance;
90
91 double cosineOfAngle = sumOfMutiply
92 / (Math.sqrt(sqOfEmail1) * Math.sqrt(sqOfEmail2));
93
94 if (cosineOfAngle > 1.001){
95 throw new Error();
96 }
97 if ( cosineOfAngle >= 0.9999999999999999) {
98 cosineOfAngle = 1;
99 }
100 double angle = Math.acos(cosineOfAngle);
101 twoEmailDistance.put(pathToEmail2, angle);
102 // System.out.println(distance+"pathToEmail2"+pathToEmail2);
103 return twoEmailDistance;
99104 }
100105
101106}
week2/target/classes/BigData/week2/Main.class
(28 / 29)
Binary files differ
week2/target/classes/BigData/week2/emailConvert.class
(7 / 7)
Binary files differ
week2/target/classes/BigData/week2/hyperplainVector.class
(8 / 8)
Binary files differ
week2/target/classes/BigData/week2/queryWithCloesestEmailSet.class
(5 / 5)
Binary files differ
week2/target/classes/BigData/week2/realDistanceBetweenTwoemail.class
(9 / 8)
Binary files differ