My Project
|
00001 #ifndef COMP_H 00002 #define COMP_H 00003 00004 #include "MySets.h" 00005 #include "MyTools.h" 00006 00007 #include "distance.h" 00008 00015 class Compare{ 00016 public: 00024 virtual double operator () (vector <set <string> >* Cl1, vector < set <string> >* Cl2){return 0;}; 00025 00031 Compare* Produce(string s); 00032 00038 void setDist(string s){ 00039 dist* creator = new dist(); 00040 d = creator->Produce(s); 00041 delete creator; 00042 }; 00043 00045 Compare(){}; 00050 Compare(int size){graphSize = size;}; 00051 00060 int getNodes(vector <set <string> >* Cl1, vector < set <string> >* Cl2){ 00061 set<string> S1; 00062 set<string>::iterator it_ss; 00063 00064 for (sui t = 0; t < Cl1->size(); t++){ 00065 for (it_ss = (*Cl1)[t].begin(); it_ss != (*Cl1)[t].end(); it_ss++){ 00066 S1.insert(*it_ss); 00067 } 00068 } 00069 00070 for (sui t = 0; t < Cl2->size(); t++){ 00071 for (it_ss = (*Cl2)[t].begin(); it_ss != (*Cl2)[t].end(); it_ss++){ 00072 S1.insert(*it_ss); 00073 } 00074 } 00075 00076 return S1.size(); 00077 }; 00078 00079 double graphSize; 00081 dist* d; 00082 }; 00083 00090 class BestMatch:public Compare{ 00091 public: 00101 double operator () (vector <set < string > >* Cl1, vector<set <string> >* Cl2){ 00102 pair <map <int, set <int> >, map <int, set < int > > > Reduced = getSim(Cl1, Cl2); 00103 00104 if (d->type > 3){ 00105 cerr << "Invalid Similarity type for BestMatch measure" << endl; 00106 exit(3); 00107 } 00108 double total = 0, current = 0, min; 00109 00110 map <int, set <int> >::iterator it_m; 00111 set <int>::iterator it_si; 00112 int nodeCount = getNodes(Cl1, Cl2); 00113 00114 for (sui t = 0; t < Cl1->size(); t++){ 00115 if ((it_m = Reduced.first.find(t)) != Reduced.first.end()){ 00116 min = 10000000; 00117 for (it_si = it_m->second.begin(); it_si != it_m->second.end(); it_si++){ 00118 if ((current = (*d)((*Cl1)[it_m->first], (*Cl2)[*it_si], graphSize)) < min) 00119 min = current; 00120 } 00121 total += min; 00122 }else{ 00123 total += Cl1->at(t).size(); 00124 } 00125 } 00126 00127 for (sui t = 0; t < Cl2->size(); t++){ 00128 if ((it_m = Reduced.second.find(t)) != Reduced.second.end()){ 00129 min = 10000000; 00130 for (it_si = it_m->second.begin(); it_si != it_m->second.end(); it_si++){ 00131 if ((current = (*d)((*Cl1)[*it_si], (*Cl2)[it_m->first], nodeCount)) < min) 00132 min = current; 00133 } 00134 total += min; 00135 }else{ 00136 total += Cl2->at(t).size(); 00137 } 00138 } 00139 00140 return (double)(total) / (double)(Cl1->size() + Cl2->size()); 00141 }; 00142 00143 private: 00153 pair <map <int, set <int> >, map <int, set < int > > > getSim (vector <set < string > >* Cl1, vector<set <string> >* Cl2){ 00154 set<string>::iterator it_sl; 00155 set<int>::iterator it_si, it_si2; 00156 00157 map <string, set <int> > first, second; 00158 map <string, set <int> >::iterator it_f, it_s; 00159 00160 for (sui t = 0; t < Cl1->size(); t++){ 00161 for (it_sl = Cl1->at(t).begin(); it_sl != Cl1->at(t).end(); it_sl++){ 00162 if ((it_f = first.find(*it_sl)) != first.end()){ 00163 it_f->second.insert(t); 00164 } else { 00165 set <int> tmp; 00166 tmp.insert(t); 00167 first.insert(make_pair<string, set<int> >(*it_sl, tmp)); 00168 } 00169 } 00170 } 00171 00172 for (sui t = 0; t < Cl2->size(); t++){ 00173 for (it_sl = Cl2->at(t).begin(); it_sl != Cl2->at(t).end(); it_sl++){ 00174 if ((it_f = second.find(*it_sl)) != second.end()){ 00175 it_f->second.insert(t); 00176 } else { 00177 set <int> tmp; 00178 tmp.insert(t); 00179 second.insert(make_pair<string, set<int> >(*it_sl, tmp)); 00180 } 00181 } 00182 } 00183 00184 map <int, set <int> > A, B; 00185 map <int, set <int> >::iterator it_a; 00186 00187 for (it_f = first.begin(); it_f != first.end(); it_f++){ 00188 if ((it_s = second.find(it_f->first)) == second.end()) continue; 00189 00190 for (it_si = it_f->second.begin(); it_si != it_f->second.end(); it_si++){ 00191 if ((it_a = A.find(*it_si)) == A.end()){ 00192 set <int> tmp; 00193 A.insert(make_pair<int, set<int> > (*it_si, tmp)); 00194 it_a = A.find(*it_si); 00195 } 00196 for (it_si2 = it_s->second.begin(); it_si2 != it_s->second.end(); it_si2++){ 00197 it_a->second.insert(*it_si2); 00198 } 00199 } 00200 } 00201 00202 for (it_f = second.begin(); it_f != second.end(); it_f++){ 00203 if ((it_s = first.find(it_f->first)) == first.end()) continue; 00204 00205 for (it_si = it_f->second.begin(); it_si != it_f->second.end(); it_si++){ 00206 if ((it_a = B.find(*it_si)) == B.end()){ 00207 set <int> tmp; 00208 B.insert(make_pair<int, set<int> > (*it_si, tmp)); 00209 it_a = B.find(*it_si); 00210 } 00211 for (it_si2 = it_s->second.begin(); it_si2 != it_s->second.end(); it_si2++){ 00212 it_a->second.insert(*it_si2); 00213 } 00214 } 00215 } 00216 00217 return make_pair< map <int, set <int> >, map <int, set < int > > >(A, B); 00218 } 00219 }; 00220 00227 class KCenter:public Compare{ 00228 public: 00229 00235 void setDist(string s){ 00236 dist* creator = new dist(); 00237 d = creator->Produce(s); 00238 distType = s; 00239 delete creator; 00240 }; 00241 00250 double operator() (vector<set<string> >* Cl1, vector<set<string> >* Cl2){ 00251 if (d->type > 3){ 00252 cerr << "Invalid Similarity type for KCenter measure" << endl; 00253 exit(3); 00254 } 00255 00256 BestMatch* BM = new BestMatch(); 00257 BM->setDist(distType); 00258 00259 int K = 0.75 * (Cl1->size() + Cl2->size()) / 2; 00260 int N = getNodes(Cl1, Cl2); 00261 vector <set <string> > KC1 = getCenter(Cl1, K, N), KC2 = getCenter(Cl2, K, N); 00262 00263 cerr << "Retreived Centers" << endl; 00264 00265 return (*BM)(&KC1, &KC2); 00266 }; 00267 00268 private: 00269 00270 string distType; 00282 vector<set<string> > getCenter(vector<set<string> >* Cl, sui K, int N){ 00283 if (K < 1 || K > Cl->size()){ 00284 cerr << "Invalid K for centers" << endl; 00285 exit(3); 00286 } 00287 00288 vector< set <string> > ret; 00289 vector < double > dsts; 00290 dsts.resize(Cl->size(), 9999999); 00291 set <string> empty; 00292 00293 /* 00294 * Add First Cluster 00295 */ 00296 00297 double max = 0, index = 0, weight = 0; 00298 00299 for (sui t = 0; t < Cl->size(); t++){ 00300 if ((weight = (*d)(empty, (*Cl)[t], N)) > max){ 00301 max = weight; 00302 index = t; 00303 } 00304 } 00305 00306 ret.push_back((*Cl)[index]); 00307 empty = (*Cl)[index]; 00308 00309 /* 00310 * Greedily increase the center 00311 */ 00312 while (ret.size() != K){ 00313 /* 00314 * Recalculate Distances 00315 */ 00316 for (sui t = 0; t < Cl->size(); t++){ 00317 if ((weight = (*d)(empty, (*Cl)[t], N)) < dsts[t]) 00318 dsts[t] = weight; 00319 } 00320 /* 00321 * Add cluster furthest away 00322 */ 00323 max = dsts[0]; 00324 for (sui t = 0; t < dsts.size(); t++){ 00325 if (dsts[t] > max){ 00326 max = dsts[t]; 00327 index = t; 00328 } 00329 } 00330 00331 empty = (*Cl)[index]; 00332 ret.push_back(empty); 00333 } 00334 00335 return ret; 00336 }; 00337 }; 00338 00345 class Interaction:public Compare{ 00346 public: 00354 double operator() (vector<set<string> >* Cl1, vector<set<string> >* Cl2){ 00355 if (d->type < 4){ 00356 cerr << "Invalid Similarity type for Interaction measure" << endl; 00357 exit(3); 00358 } 00359 map < string, set <int> > S1, S2; 00360 map < string, set <int> >::iterator it_m, it_m2; 00361 set <int>::iterator it_si; 00362 set<string>::iterator it_ss, it_ss2; 00363 00364 for (sui t = 0; t < Cl1->size(); t++){ 00365 for (it_ss = (*Cl1)[t].begin(); it_ss != (*Cl1)[t].end(); it_ss++){ 00366 if ((it_m = S1.find(*it_ss)) != S1.end()){ 00367 it_m->second.insert(t); 00368 } else { 00369 set <int> tmp; 00370 tmp.insert(t); 00371 S1.insert(make_pair<string, set <int> >(*it_ss, tmp)); 00372 } 00373 } 00374 } 00375 00376 for (sui t = 0; t < Cl2->size(); t++){ 00377 for (it_ss = (*Cl2)[t].begin(); it_ss != (*Cl2)[t].end(); it_ss++){ 00378 if ((it_m = S2.find(*it_ss)) != S2.end()){ 00379 it_m->second.insert(t); 00380 } else { 00381 set <int> tmp; 00382 tmp.insert(t); 00383 S2.insert(make_pair<string, set <int> >(*it_ss, tmp)); 00384 } 00385 } 00386 } 00387 00388 int SC1 = S1.size(), SC2 = S2.size(); 00389 double cP = 0, prod = 0; 00390 00391 for (it_m = S1.begin(); it_m != S1.end(); it_m++){ 00392 it_m2 = it_m; 00393 for (++it_m2; it_m2 != S1.end(); it_m2++){ 00394 set <int> Int = Intersection< set<int> >(it_m->second, it_m2->second); 00395 prod = 1; 00396 for (it_si = Int.begin(); it_si != Int.end(); it_si++){ 00397 prod *= (1.0/double((*Cl1)[*it_si].size())); 00398 } 00399 cP = 1 - (1 - (1.0 / (double)SC1)) * prod; 00400 00401 pair <string, string> curr = make_pair<string, string> (it_m->first, it_m2->first); 00402 if ((it_p = P.find(curr)) != P.end()){ 00403 it_p->second.first = cP; 00404 } else { 00405 P.insert(make_pair<pair <string, string>, pair <double, double> >(curr, make_pair<double, double>(cP, 0.0))); 00406 } 00407 } 00408 } 00409 00410 for (it_m = S2.begin(); it_m != S2.end(); it_m++){ 00411 it_m2 = it_m; 00412 for (++it_m2; it_m2 != S2.end(); it_m2++){ 00413 set <int> Int = Intersection< set<int> >(it_m->second, it_m2->second); 00414 prod = 1; 00415 for (it_si = Int.begin(); it_si != Int.end(); it_si++){ 00416 prod *= (1.0/double((*Cl2)[*it_si].size())); 00417 } 00418 cP = 1 - (1 - (1.0 / (double)SC2)) * prod; 00419 00420 pair <string, string> curr = make_pair<string, string> (it_m->first, it_m2->first); 00421 if ((it_p = P.find(curr)) != P.end()){ 00422 it_p->second.second = cP; 00423 } else { 00424 P.insert(make_pair<pair <string, string>, pair <double, double> >(curr, make_pair<double, double>(0.0, cP))); 00425 } 00426 } 00427 } 00428 00429 /* 00430 * Put in a nice form 00431 */ 00432 vector <double> A, B; 00433 for (it_p = P.begin(); it_p != P.end(); it_p++){ 00434 A.push_back(it_p->second.first); 00435 B.push_back(it_p->second.second); 00436 } 00437 00438 P.clear(); 00439 /* 00440 * Calculate 00441 */ 00442 return (*d)(A, B, getNodes(Cl1, Cl2)); 00443 } 00444 00445 private: 00446 map <pair <string, string>, pair <double, double> > P; 00447 map <pair <string, string>, pair <double, double> >::iterator it_p; 00448 }; 00449 00457 class FScore:public Compare{ 00458 public: 00466 double operator () (vector <set <string> >* Cl1, vector < set <string> >* Cl2){ 00467 int tp = 0; 00468 set <int> ttp; 00469 for (sui i = 0; i < Cl2->size(); i++){ 00470 for (sui j = 0; j < Cl1->size(); j++){ 00471 if (getScore(Cl2->at(i), Cl1->at(j)) > 0.9){ 00472 tp++; 00473 ttp.insert(j); 00474 } 00475 } 00476 } 00477 00478 int fn = Cl1->size() - ttp.size(); 00479 int fp = Cl2->size() - tp; 00480 00481 double p = (double)tp / (double)(tp + fp); 00482 double r = (double)tp / (double)(tp + fn); 00483 00484 return (2 * p * r)/(p + r); 00485 } 00486 00487 private: 00494 double getScore (set <string> c1, set <string> c2){ 00495 set <string> inter = Intersection<set <string> >(c1, c2); 00496 00497 int tp = inter.size(); 00498 int fp = c2.size() - tp; 00499 int fn = c1.size() - tp; 00500 00501 double p = (double)tp / (double)(tp + fp); 00502 double r = (double)tp / (double)(tp + fn); 00503 00504 return (2 * p * r)/(p + r); 00505 } 00506 }; 00507 00514 class NMI:public Compare{ 00515 public: 00523 double operator () (vector <set <string> >* Cl1, vector < set <string> >* Cl2){ 00524 if (d->type != 3) 00525 { 00526 cerr << "Entropy Required as comparison" << endl; 00527 exit(22); 00528 } 00529 00530 double Hy = 0, Hx = 0; 00531 for (sui i = 0; i < Cl2->size(); i++){ 00532 Hy += (((*(entropy*)d)).h((*Cl2)[i].size(), graphSize) + ((*(entropy*)d)).h(graphSize - (*Cl2)[i].size(), graphSize)); 00533 } 00534 00535 for (sui i = 0; i < Cl1->size(); i++){ 00536 Hx += (((*(entropy*)d)).h((*Cl1)[i].size(), graphSize) + ((*(entropy*)d)).h(graphSize - (*Cl1)[i].size(), graphSize)); 00537 } 00538 00539 set <string>::iterator it_s; 00540 map <string, set <int> > m1, m2; 00541 map <string, set <int> >::iterator it_mm; 00542 sui minC1 = 0, minC2 = 0; 00543 sui minC1Size = numeric_limits<unsigned short int>::max(); 00544 sui minC2Size = minC1Size; 00545 /*Membership maps*/ 00546 for (sui i = 0; i < Cl1->size(); i++){ 00547 if ((*Cl1)[i].size() < minC1Size){ 00548 minC1 = i; 00549 minC1Size = (*Cl1)[i].size(); 00550 } 00551 00552 for (it_s = (*Cl1)[i].begin(); it_s != (*Cl1)[i].end(); it_s++){ 00553 if ((it_mm = m1.find(*it_s)) != m1.end()){ 00554 it_mm->second.insert(i); 00555 } else { 00556 set <int> temp; 00557 temp.insert(i); 00558 m1.insert(make_pair<string, set <int> > (*it_s, temp)); 00559 } 00560 } 00561 } 00562 00563 for (sui i = 0; i < Cl2->size(); i++){ 00564 if ((*Cl2)[i].size() < minC2Size){ 00565 minC2 = i; 00566 minC2Size = (*Cl2)[i].size(); 00567 } 00568 00569 for (it_s = (*Cl2)[i].begin(); it_s != (*Cl2)[i].end(); it_s++){ 00570 if ((it_mm = m2.find(*it_s)) != m2.end()){ 00571 it_mm->second.insert(i); 00572 } else { 00573 set <int> temp; 00574 temp.insert(i); 00575 m2.insert(make_pair<string, set <int> > (*it_s, temp)); 00576 } 00577 } 00578 } 00579 00580 double Hxy = 0; 00581 for (sui i = 0; i < Cl1->size(); i++){ 00582 set <int> potents; 00583 for (it_s = (*Cl1)[i].begin(); it_s != (*Cl1)[i].end(); it_s++){ 00584 if ((it_mm = m2.find(*it_s)) != m2.end()){ 00585 potents.insert(it_mm->second.begin(), it_mm->second.end()); 00586 } 00587 } 00588 00589 if (potents.size() == 0){ 00590 Hxy += (*d)((*Cl1)[i], (*Cl2)[minC2], graphSize); 00591 continue; 00592 } 00593 00594 double minimum = numeric_limits<double>::max(); 00595 for (set <int>::iterator j = potents.begin(); j != potents.end(); j++) 00596 //for (sui j = 0; j < Cl2->size(); j++) 00597 { 00598 double current = (*d)((*Cl1)[i], (*Cl2)[*j], graphSize); 00599 if (current < minimum) 00600 minimum = current; 00601 } 00602 00603 Hxy += minimum; 00604 } 00605 00606 double Hyx = 0; 00607 for (sui i = 0; i < Cl2->size(); i++){ 00608 set <int> potents; 00609 for (it_s = (*Cl2)[i].begin(); it_s != (*Cl2)[i].end(); it_s++){ 00610 if ((it_mm = m1.find(*it_s)) != m1.end()){ 00611 potents.insert(it_mm->second.begin(), it_mm->second.end()); 00612 } 00613 } 00614 00615 if (potents.size() == 0){ 00616 Hxy += (*d)((*Cl2)[i], (*Cl1)[minC1], graphSize); 00617 continue; 00618 } 00619 00620 double minimum = numeric_limits<double>::max(); 00621 for (set <int>::iterator j = potents.begin(); j != potents.end(); j++){ 00622 //for (sui j = 0; j < Cl1->size(); j++) 00623 00624 double current = (*d)((*Cl2)[i], (*Cl1)[*j], graphSize); 00625 if (current < minimum) 00626 minimum = current; 00627 } 00628 Hyx += minimum; 00629 } 00630 00631 double Ixy = (Hx - Hxy + Hy - Hyx) / 2; 00632 00633 return (Ixy / max(Hx, Hy)); 00634 }; 00635 00636 private: 00637 00638 }; 00639 00647 Compare* Compare::Produce(string s){ 00648 if ((strcmp(s.c_str(), "best")) == 0){ 00649 return new BestMatch(); 00650 } else if ((strcmp(s.c_str(), "kcent")) == 0){ 00651 return new KCenter(); 00652 } else if ((strcmp(s.c_str(), "inter")) == 0){ 00653 return new Interaction(); 00654 } else if ((strcmp(s.c_str(), "fscore")) == 0){ 00655 return new FScore(); 00656 } else if ((strcmp(s.c_str(), "nmi")) == 0){ 00657 return new NMI(); 00658 } else { 00659 cerr << "Invalid comparison method chosen" << endl; 00660 exit(4); 00661 } 00662 }; 00663 00664 #endif