#include "profile.h"

#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#include <boost/foreach.hpp>
#define foreach BOOST_FOREACH

using namespace std;
namespace fs = boost::filesystem;

namespace {

// Copies the value stored under `key` into `out` and returns true; returns
// false and leaves `out` untouched when the key is absent. Unlike operator[],
// this never inserts into the container.
template <typename Map, typename Key, typename Value>
bool lookupValue(const Map &container, const Key &key, Value &out)
{
  typename Map::const_iterator pos = container.find(key);
  if (pos == container.end())
    return false;
  out = pos->second;
  return true;
}

// Deletes every pointer stored in `container`, then empties the container.
template <typename Container>
void deleteOwnedPointers(Container &container)
{
  for (typename Container::iterator pos = container.begin();
       pos != container.end(); ++pos) {
    delete *pos;
  }
  container.clear();
}

// Reads a "relations" description from `source` into `schema`.
// A line that starts in column 0 names a file (text up to the first ':');
// each following indented line names one field of that file, and fields are
// numbered from 0 in order of appearance. Blank and whitespace-only lines
// are skipped.
void parseRelationsFile(istream &source,
                        map<File, map<Relation, Field> > &schema)
{
  string line, fname;
  int fieldcount = 0;
  while (getline(source, line)) {
    const string::size_type text_start =
      line.find_first_not_of(" \t\n\v\f\r");
    if (text_start == string::npos)
      continue; // nothing but whitespace
    if (text_start == 0) { // file name
      fname = line.substr(0, line.find(':'));
      schema[fname] = map<Relation, Field>();
      fieldcount = 0;
    } else { // field of the current file
      line.erase(0, text_start);
      const string relation = line.substr(0, line.find_first_of(" "));
      schema[fname][relation] = fieldcount++;
    }
  }
}

} // namespace

// Loads the profile stored in directory `pn`.
//
// If the directory contains a file called "virtual", every non-empty line of
// that file names an actual profile (relative to the parent of `pn`),
// optionally followed by a repetition count; otherwise `pn` itself is the
// only profile. For each actual profile the "relations", "item", "parse" and
// "preference" files are read (".gz" variants are tried first, see openf).
//
// Terminates the process with exit(1) when `pn` is not a directory, when a
// profile has no "relations" file, or when no profile was loaded. A missing
// "item", "parse" or "preference" file is only reported on stderr.
Profile::Profile(const string &pn)
{
  pname = pn;
  int profilecounter = 0;
  // remove trailing slash (guarded so that an empty path cannot throw)
  if (!pname.empty() && pname.at(pname.length() - 1) == '/')
    pname.erase(pname.length() - 1, 1);
  fs::path ip(pname);
  if (!fs::is_directory(ip)) {
    cerr << "Profile path '" << ip << "' not found. Cannot continue";
    exit(1);
  }

  ifstream virtualf(string(ip.string() + "/virtual").c_str());
  if (virtualf.is_open()) { // virtual profile, so add actual profiles
    string entry;
    while (getline(virtualf, entry)) {
      if (entry.empty())
        continue; // a blank line does not name a profile

      // erase quotes: the name is whatever sits between the first and the
      // last double quote; an entry without quotes is taken as-is
      const string::size_type open_quote = entry.find('"');
      const string::size_type close_quote = entry.rfind('"');
      const string::size_type name_start =
        (open_quote == string::npos) ? 0 : open_quote + 1;
      string prof;
      string remainder;
      if (close_quote == string::npos) {
        prof = entry.substr(name_start);
      } else {
        const string::size_type name_len = (close_quote >= name_start)
          ? close_quote - name_start : string::npos;
        prof = entry.substr(name_start, name_len);
        remainder = entry.substr(close_quote + 1);
      }
      profiles.push_back(fs::path(ip.parent_path() / prof));

      // we extend the TSDB virtual profile syntax by allowing an
      // optional number of repetitions of the actual profile
      unsigned int reps;
      istringstream repnum(remainder);
      repnum >> reps;
      if (repnum.fail())
        reps = 1;
      profileRepCount.push_back(reps);
    }
  } else {
    profiles.push_back(ip); // not virtual
    profileRepCount.push_back(1);
  }

  for (vector<fs::path>::iterator piter = profiles.begin();
       piter != profiles.end(); ++piter) {
    const string dir = piter->string();
    // these need to be declared in scope, although they are
    // assigned in the openf function
    boost::iostreams::filtering_stream<boost::iostreams::input> in;
    ifstream file;

    ifstream rfile(string(dir + "/relations").c_str());
    if (!rfile.is_open()) {
      cerr << "No relation file in profile. Exiting." << endl;
      exit(1);
    }
    map<File, map<Relation, Field> > *rptr =
      new map<File, map<Relation, Field> >;
    parseRelationsFile(rfile, *rptr);
    relations.push_back(rptr);
    rfile.close();

    // The per-profile maps below are pushed even when their file could not
    // be opened: items, readings, inputs and preferences are all indexed by
    // the profile number stored in itemToProfile and must stay aligned.
    map<tIid, string> *iptr = new map<tIid, string>;
    if (openf(in, file, string(dir + "/item"))) {
      const int id_field = (*rptr)["item"]["i-id"];
      const int input_field = (*rptr)["item"]["i-input"];
      string line;
      while (getline(in, line)) {
        int item_id = 0;
        istringstream(getField(line, id_field)) >> item_id;
        iptr->insert(tISValue(item_id, getField(line, input_field)));
        itemToProfile.insert(tIidIValue(item_id, profilecounter));
      }
      file.close();
      in.reset();
    } else {
      cerr << "Problem opening item file in " << dir << endl;
    }
    items.push_back(iptr);

    map<tIid, int> *readingsptr = new map<tIid, int>;
    map<tIid, string> *pinputptr = new map<tIid, string>;
    map<tIid, map<int, int> > *prefptr = new map<tIid, map<int, int> >;
    if (openf(in, file, string(dir + "/parse"))) {
      const int id_field = (*rptr)["parse"]["i-id"];
      const int readings_field = (*rptr)["parse"]["readings"];
      const int pinput_field = (*rptr)["parse"]["p-input"];
      string line;
      while (getline(in, line)) {
        int item_id = 0;
        int num_readings = 0;
        istringstream(getField(line, id_field)) >> item_id;
        istringstream(getField(line, readings_field)) >> num_readings;
        readingsptr->insert(tIidIValue(item_id, num_readings));
        pinputptr->insert(tISValue(item_id, getField(line, pinput_field)));
        prefptr->insert(
          map<tIid, map<int, int> >::value_type(item_id, map<int, int>()));
      }
      file.close();
      in.reset();
    } else {
      cerr << "Problem opening parse file in " << dir << endl;
    }
    readings.push_back(readingsptr);
    inputs.push_back(pinputptr);

    if (openf(in, file, string(dir + "/preference"))) {
      // NOTE(review): the "parse-id" column is used as the item key here,
      // while the maps above are keyed by "i-id"; this presumably relies on
      // both ids being equal in the profiles used -- confirm.
      const int id_field = (*rptr)["preference"]["parse-id"];
      const int result_field = (*rptr)["preference"]["result-id"];
      string line;
      while (getline(in, line)) {
        tIid item_id = 0;
        int parse_id = 0;
        istringstream(getField(line, id_field)) >> item_id;
        istringstream(getField(line, result_field)) >> parse_id;
        ((*prefptr)[item_id]).insert(tIidIValue(parse_id, 1));
      }
      file.close();
      in.reset();
    } else {
      cerr << "Problem opening preference file in " << dir
           << "; no gold preferences recorded" << endl;
    }
    preferences.push_back(prefptr);
    profilecounter++;
  }

  if (!profilecounter) {
    cerr << "No profiles were loaded from " << pname
         << ". Cannot continue." << endl;
    exit(1);
  }
  resultprofilecount = 0;
  autoPrefsTopN = -1;
}

// Releases every per-profile map allocated by the constructor.
Profile::~Profile()
{
  deleteOwnedPointers(items);
  deleteOwnedPointers(preferences);
  deleteOwnedPointers(relations);
  deleteOwnedPointers(readings);
  deleteOwnedPointers(inputs);
}

// Opens `fname` for reading through `in`, using `filestream` as the
// underlying file. "<fname>.gz" is tried first and, if it opens, is read
// through a gzip decompressor; otherwise `fname` itself is opened as is.
// Returns false (with nothing pushed onto `in`) when neither file opens.
bool Profile::openf(
  boost::iostreams::filtering_stream<boost::iostreams::input> &in,
  ifstream &filestream, const string &fname)
{
  filestream.clear(); // drop error flags left over from an earlier use
  filestream.open(string(fname + ".gz").c_str(),
                  ios_base::in | ios_base::binary);
  if (filestream.is_open()) {
    in.push(boost::iostreams::gzip_decompressor());
    in.push(filestream);
    return true;
  }

  filestream.clear(); // the failed open above set failbit
  filestream.open(fname.c_str());
  if (!filestream.is_open())
    return false;
  in.push(filestream);
  return true;
}

// Returns the "i-input" text recorded for `item` in the item file, or an
// empty string when the item is unknown.
string Profile::getItem(tIid item)
{
  int profile = 0;
  string text;
  if (lookupValue(itemToProfile, item, profile))
    lookupValue(*items[profile], item, text);
  return text;
}

// Returns the "p-input" text recorded for `item` in the parse file, or an
// empty string when the item is unknown.
string Profile::getInput(tIid item)
{
  int profile = 0;
  string text;
  if (lookupValue(itemToProfile, item, profile))
    lookupValue(*inputs[profile], item, text);
  return text;
}

// Returns the lowest result id marked as preferred for `item`, or -1 when
// the item is unknown or has no preferred result.
int Profile::getPreference(tIid item)
{
  int profile = 0;
  if (!lookupValue(itemToProfile, item, profile))
    return -1;
  const map<tIid, map<int, int> > &prefs = *preferences[profile];
  map<tIid, map<int, int> >::const_iterator entry = prefs.find(item);
  if (entry == prefs.end() || entry->second.empty())
    return -1;
  return entry->second.begin()->first;
}

// Marks result `parse` of `item` as preferred. With append == false all
// previously recorded preferences of the item are discarded first.
// Does nothing for an item that is not part of the profile.
void Profile::setPreference(tIid &item, int &parse, bool append)
{
  int profile = 0;
  if (!lookupValue(itemToProfile, item, profile))
    return;
  map<int, int> &marked = (*preferences[profile])[item];
  if (!append)
    marked.clear();
  marked.insert(tIIValue(parse, 1));
}

// Returns true when result `parse` of `item` is marked as preferred.
// Unknown items are reported as not gold; the lookup never adds entries.
bool Profile::isGold(tIid item, int parse)
{
  int profile = 0;
  if (!lookupValue(itemToProfile, item, profile))
    return false;
  const map<tIid, map<int, int> > &prefs = *preferences[profile];
  map<tIid, map<int, int> >::const_iterator entry = prefs.find(item);
  if (entry == prefs.end())
    return false;
  map<int, int>::const_iterator mark = entry->second.find(parse);
  return mark != entry->second.end() && mark->second == 1;
}

// Returns the number of results marked as preferred for `item` (0 for an
// unknown item); the lookup never adds entries.
int Profile::numGold(tIid item)
{
  int profile = 0;
  if (!lookupValue(itemToProfile, item, profile))
    return 0;
  const map<tIid, map<int, int> > &prefs = *preferences[profile];
  map<tIid, map<int, int> >::const_iterator entry = prefs.find(item);
  if (entry == prefs.end())
    return 0;
  return entry->second.size();
}

// Returns the "readings" value recorded for `item` in the parse file, or -1
// when the item is unknown or has no parse record.
int Profile::getReadings(tIid item)
{
  int profile = 0;
  int count = -1;
  if (lookupValue(itemToProfile, item, profile))
    lookupValue(*readings[profile], item, count);
  return count;
}

// Returns the next record of the "result" files, walking through all actual
// profiles in order. The returned value is ((parse-id, result-id), text),
// where text is the "mrs" field for type == MRS and the "derivation" field
// otherwise. Failure and end of data are signalled through the first id:
//   -1  the first result file could not be opened
//   -2  all result files have been read completely
//   -3  the result file of a later profile could not be opened
// Calling again after -2 starts over with the first profile.
pair<pair<tIid,int>,string> Profile::getResult(ResultType type)
{
  typedef pair<tIid, int> ResultKey;
  typedef pair<ResultKey, string> Result;

  if (resultfilter.empty()) { // first result
    // Iteration (re)starts at profiles[0], so the cursor has to point there
    // as well; after a complete pass it equals profiles.size() and would
    // index relations out of range.
    resultprofilecount = 0;
    if (!openf(resultfilter, resultfile,
               string(profiles[0].string() + "/result"))) {
      cerr << "Couldn't open a result file in "
           << profiles[0].string() << endl;
      return Result(ResultKey(-1, -1), string());
    }
  }

  string line;
  while (!getline(resultfilter, line)) {
    // current result file is exhausted, move on to the next profile
    resultfile.close();
    resultfilter.reset();
    resultprofilecount++;
    if (resultprofilecount == profiles.size()) // read all profiles
      return Result(ResultKey(-2, -1), string());
    if (!openf(resultfilter, resultfile,
               string(profiles[resultprofilecount].string() + "/result"))) {
      cerr << "Couldn't open a result file in "
           << profiles[resultprofilecount].string() << endl;
      return Result(ResultKey(-3, -1), string());
    }
  }

  map<File, map<Relation, Field> > &schema = *relations[resultprofilecount];
  tIid item_id = 0;
  int parse_id = 0;
  // NOTE(review): "parse-id" is reported as the item id -- presumably both
  // ids coincide in the profiles used; confirm.
  istringstream(getField(line, schema["result"]["parse-id"])) >> item_id;
  istringstream(getField(line, schema["result"]["result-id"])) >> parse_id;
  string rtype("derivation");
  if (type == MRS)
    rtype = string("mrs");
  return Result(ResultKey(item_id, parse_id),
                getField(line, schema["result"][rtype]));
}

// Returns the "mrs" (type == MRS) or "derivation" field of result `parse`
// of `item`, found by scanning the result file of the profile the item
// belongs to. Returns an empty string, after reporting on stderr, when the
// item is unknown, the result file cannot be opened, or no record matches.
string Profile::getResult(tIid &item, int &parse, ResultType type)
{
  int profile = 0;
  if (!lookupValue(itemToProfile, item, profile)) {
    cerr << "No item " << item << " in profile " << pname << endl;
    return string();
  }

  const string dir = profiles[profile].string();
  boost::iostreams::filtering_stream<boost::iostreams::input> in;
  ifstream file;
  if (!openf(in, file, string(dir + "/result"))) {
    cerr << "Problem opening result file in " << dir << endl;
    return string();
  }

  // Use the field layout of the profile that owns the item; the cursor of
  // the sequential getResult(ResultType) is unrelated to this lookup.
  map<File, map<Relation, Field> > &schema = *relations[profile];
  const int id_field = schema["result"]["parse-id"];
  const int result_field = schema["result"]["result-id"];
  string rtype("derivation");
  if (type == MRS)
    rtype = string("mrs");

  bool found = false;
  string text;
  string line;
  while (getline(in, line)) {
    tIid item_id = 0;
    int parse_id = 0;
    istringstream(getField(line, id_field)) >> item_id;
    istringstream(getField(line, result_field)) >> parse_id;
    if (item_id == item && parse_id == parse) {
      text = getField(line, schema["result"][rtype]);
      found = true;
      break;
    }
  }
  file.close();
  in.reset();

  if (!found) {
    cerr << "result " << parse << " of item " << item
         << " not found in result file in " << dir << endl;
  }
  return text;
}

// Returns a newly allocated set holding the ids of all items of all actual
// profiles. The caller owns the set and has to delete it.
set<tIid>* Profile::getItemIDs()
{
  set<tIid>* item_ids = new set<tIid>();
  for (vector<map<tIid, string>*>::const_iterator pi_itr = items.begin();
       pi_itr != items.end(); ++pi_itr) {
    for (map<tIid, string>::const_iterator entry = (*pi_itr)->begin();
         entry != (*pi_itr)->end(); ++entry) {
      item_ids->insert(entry->first);
    }
  }
  return item_ids;
}

// returns the number of times we should duplicate the actual item
// due to duplication requested in the training profile;
// an item that is not part of the profile is reported with a count of 1
unsigned int Profile::getRepsForItem(tIid item_id)
{
  int profile = 0;
  if (!lookupValue(itemToProfile, item_id, profile))
    return 1;
  return profileRepCount[profile];
}

// Returns the profile path as given to the constructor, without a trailing
// slash.
string &Profile::getPath()
{
  return pname;
}

// Returns true when the profile directory contains a "virtual" file.
bool Profile::isVirtual()
{
  // we could have a profile count of 1 and still have a virtual profile
  // (technically) so need to check for the filename
  return fs::exists(fs::path(pname) / "virtual");
}

// Writes exact-match statistics to "<scorefile>.cvexact": the fraction of
// scored items for which one of the results 0 .. n-1 is gold, followed by a
// line "<exact>/<seen>". With all == false, items whose reading count is 0
// are not scored. When no item is scored at all the fraction is written
// as 0.
void Profile::scoreProfile(const string &scorefile, bool all, int n)
{
  int total_exact = 0;
  int total_seen = 0;
  set<tIid>* test_item_ids = getItemIDs();
  for (set<tIid>::iterator iiter = test_item_ids->begin();
       iiter != test_item_ids->end(); ++iiter) {
    if (!all && getReadings(*iiter) == 0)
      continue; // skip items without results if all==false
    ++total_seen;
    for (int r = 0; r < n; ++r) {
      if (isGold(*iiter, r)) {
        ++total_exact;
        break;
      }
    }
  }
  delete test_item_ids; // getItemIDs hands over ownership

  //TODO change filename to reflect N
  string outfilename = scorefile + ".cvexact";
  ofstream outf(outfilename.c_str());
  if (outf.is_open()) {
    const float exact_ratio = (total_seen != 0)
      ? static_cast<float>(total_exact) / total_seen : 0.0f;
    outf << exact_ratio << "\n"
         << total_exact << "/" << total_seen << endl;
  }
  outf.close();
}

// Returns field number `field` (counting from 0) of the '@'-separated
// record `line`. Terminates the process with exit(1) when the line has
// fewer than `field` separators.
string Profile::getField(string line, int field)
{
  string::size_type start = 0;
  string::size_type end = line.find('@');
  int atcount;
  for (atcount = 0; atcount < field && end != string::npos; atcount++) {
    start = end + 1;
    end = line.find('@', start);
  }
  if (atcount != field) {
    cerr << "Mal-formed line: " << line << endl;
    cerr << "Insufficent delimiters." << endl;
    exit(1);
  }
  if (end == string::npos)
    return line.substr(start); // last field runs to the end of the line
  return line.substr(start, end - start);
}

// Replaces the preferences of every item that has at least one reading by
// the results 0 .. min(readings, top_n)-1, and records top_n for
// topNForAutoPrefs().
void Profile::setPrefsFromTopNResults(int top_n)
{
  set<tIid> *item_ids = getItemIDs();
  for (set<tIid>::const_iterator iid_iter = item_ids->begin();
       iid_iter != item_ids->end(); ++iid_iter) {
    tIid item = *iid_iter;
    int readings = getReadings(item);
    for (int i = 0; i < readings && i < top_n; ++i) {
      // overwrites all gold prefs, plus more (those w/o gold marked as
      // well): only the first result clears the old preferences, the
      // remaining ones are appended so that all top_n stay marked
      setPreference(item, i, i > 0);
    }
  }
  delete item_ids;
  autoPrefsTopN = top_n;
}

// Returns the value last passed to setPrefsFromTopNResults, or -1 when it
// has not been called.
int Profile::topNForAutoPrefs()
{
  return autoPrefsTopN;
}