/*
	Dataset for tag recommender

	Author:   Steffen Rendle, http://www.libfm.org/
	modified: 2010-12-10

	Copyright 2010 Steffen Rendle, see license.txt for more information
*/

#ifndef __DATA_H__
#define __DATA_H__

#include <assert.h>
#include <math.h>

#include <algorithm>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include "../../util/token_reader.h"
#include "../../util/util.h"
#include "../../util/matrix.h"
#include "../../util/smatrix.h"


/*
	Holds the training triples, the test triples and the largest user, item
	and tag ids seen so far.
*/
class Dataset {
	private:
		// Fills `data` from the given file, updates the max_*_id members
		// and prints summary statistics.
		void loadData(std::string filename);
		// Fills `test_data` / `test_posts` from the given file, updates the
		// max_*_id members and prints summary statistics.
		void loadTest(std::string filename);

	public:
		// all data is stored in the order (user,item,tag)
		SparseTensorBoolean data;
		SparseTensorBoolean test_data;
		SparseMatrixBoolean test_posts;

		// Largest ids encountered so far; -1 until something has been loaded.
		int max_user_id;
		int max_item_id;
		int max_tag_id;

		// Announces the file on stdout and loads it as the training data.
		Dataset(std::string filename)
			: max_user_id(-1), max_item_id(-1), max_tag_id(-1)
		{
			std::cout << "read data file " << filename << "..." << std::flush;
			loadData(filename);
		}

		// Announces the file on stdout and loads it as the test split.
		void loadTestSplit(std::string filename) {
			std::cout << "read test file " << filename << "..." << std::flush;
			loadTest(filename);
		}
};


/*
	Loads the training data from `filename` into `data`, raises
	max_user_id / max_item_id / max_tag_id to the largest ids found and
	prints summary statistics to stdout.

	The user, item and tag counts are printed as (largest id + 1). A "post"
	is one (user,item) pair; a "triple" is one (user,item,tag) entry.

	Declared `inline` because the definition lives in a header: without it,
	including this header from more than one translation unit produces
	multiple-definition errors at link time.
*/
inline void Dataset::loadData(std::string filename) {
	data.fromFile(filename);
	int num_posts = 0;
	int num_triples = 0;
	for(SparseTensorBoolean::const_iterator t = data.begin(); t != data.end(); ++t) {
		max_user_id = std::max(t->first, max_user_id);
		for(SparseMatrixBoolean::const_iterator i = t->second.begin(); i != t->second.end(); ++i) {
			max_item_id = std::max(i->first, max_item_id);
			for(SparseVectorBoolean::const_iterator j = i->second.begin(); j != i->second.end(); ++j) {
				max_tag_id = std::max(*j, max_tag_id);
				num_triples++;
			}
			num_posts++;
		}
	}

	// The first endl terminates the "read data file ..." progress line
	// written by the constructor.
	std::cout << std::endl;
	std::cout << "number of users             " << max_user_id+1 << std::endl;
	std::cout << "number of items             " << max_item_id+1 << std::endl;
	std::cout << "number of tags              " << max_tag_id+1 << std::endl;
	std::cout << "number of posts             " << num_posts << std::endl;
	std::cout << "number of distinct triples  " << num_triples << std::endl;
}
		
		
/*
	Loads the test split from `filename` into `test_data`, records every
	test (user,item) post in `test_posts`, raises max_user_id / max_item_id /
	max_tag_id so they also cover ids that appear in the test split, and
	prints summary statistics to stdout.

	Throws a std::string ("train and test overlap in post <user>,<item>")
	if a test post is also present in the training data.

	Declared `inline` because the definition lives in a header: without it,
	including this header from more than one translation unit produces
	multiple-definition errors at link time.
*/
inline void Dataset::loadTest(std::string filename) {
	test_data.fromFile(filename);
	SparseVectorBoolean test_users;
	SparseVectorBoolean test_items;
	SparseVectorBoolean test_tags;

	// Index the test posts as user -> items. This is done in its own pass so
	// that test_posts is complete before the overlap check below can throw.
	for(SparseTensorBoolean::const_iterator t = test_data.begin(); t != test_data.end(); ++t) {
		for(SparseMatrixBoolean::const_iterator i = t->second.begin(); i != t->second.end(); ++i) {
			test_posts[t->first].insert(i->first);
		}
	}
	int num_posts = 0;
	int num_triples = 0;
	for(SparseTensorBoolean::const_iterator t = test_data.begin(); t != test_data.end(); ++t) {
		test_users.insert(t->first);
		max_user_id = std::max(t->first, max_user_id);
		for(SparseMatrixBoolean::const_iterator i = t->second.begin(); i != t->second.end(); ++i) {
			test_items.insert(i->first);
			max_item_id = std::max(i->first, max_item_id);
			for(SparseVectorBoolean::const_iterator j = i->second.begin(); j != i->second.end(); ++j) {
				test_tags.insert(*j);
				max_tag_id = std::max(*j, max_tag_id);
				num_triples++;
			}
			num_posts++;
			// Assertion: post in train and test should be distinct:
			{
				SparseTensorBoolean::const_iterator train_user = data.find(t->first);
				if (train_user != data.end()) {
					SparseMatrixBoolean::const_iterator train_post = train_user->second.find(i->first);
					if (train_post != train_user->second.end()) {
						// Format the ids through a stream: adding an int to a
						// string literal is pointer arithmetic, not
						// concatenation.
						std::ostringstream message;
						message << "train and test overlap in post " << t->first << "," << i->first;
						// Thrown as std::string so existing catch sites keep working.
						throw message.str();
					}
				}
			}
		}
	}
	// The first endl terminates the "read test file ..." progress line
	// written by loadTestSplit().
	std::cout << std::endl;
	std::cout << "number of test users        " << test_users.size() << std::endl;
	std::cout << "number of test items        " << test_items.size() << std::endl;
	std::cout << "number of test tags         " << test_tags.size() << std::endl;
	std::cout << "number of test posts        " << num_posts << std::endl;
	std::cout << "number of d. test  triples  " << num_triples << std::endl;
}

			
#endif /*DATA_H_*/
