import os 

import sys
import numpy as np
from scipy import stats
import utilities


# Extracts the hi and pi columns from the AMiner data set, which is too big to open in Excel.
def FilterAMiner(dataPath):
    """Copy the ``hi`` and ``pi`` columns of a CSV file into a new CSV file.

    The input is parsed with its first row as column names. The two
    selected columns are written next to the input as
    ``<input name without extension>_filtered.csv`` with the header
    ``HI,PI``.

    Args:
        dataPath: Path to a comma-separated file whose header row contains
            the column names ``hi`` and ``pi``.
    """
    data = np.genfromtxt(dataPath, delimiter=",", dtype=None, names=True)
    finalData = np.column_stack((data["hi"], data["pi"]))

    # Build the output name from the file extension only. A plain
    # str.replace(".csv", ...) also rewrites ".csv" inside directory names
    # and, when the path contains no ".csv" at all, returns the input path
    # unchanged so the source file itself would be overwritten.
    root, _ = os.path.splitext(dataPath)
    np.savetxt(
        root + "_filtered.csv",
        finalData,
        delimiter=",",
        header="HI,PI",
        comments="",
    )


def ConvertDataForCorr(dataFiles, dataFilesDir):
    """Write a ``*_converted.csv`` companion for every listed CSV file.

    For each input the first two columns are copied and three constant
    columns are appended: ``k`` (all zeros), ``o`` (all zeros) and ``r``
    (the Pearson correlation coefficient of the two data columns, repeated
    on every row). The output is written into ``dataFilesDir`` under the
    input name with its extension replaced by ``_converted.csv``.

    Args:
        dataFiles: Iterable of CSV file names located in ``dataFilesDir``.
            Each file needs a header row and at least two columns.
        dataFilesDir: Directory that holds the inputs and receives the
            outputs.
    """
    for dataFileName in dataFiles:
        # Call form with a single argument prints the same text on
        # Python 2 and Python 3.
        print("processing " + dataFileName)

        data = np.genfromtxt(
            os.path.join(dataFilesDir, dataFileName),
            delimiter=",",
            dtype=None,
            names=True,
        )
        names = data.dtype.names

        # Only the first two columns take part in the conversion.
        X = data[names[0]]
        Y = data[names[1]]

        rowCount = len(data)
        k = np.zeros(rowCount)
        o = np.zeros(rowCount)
        r = np.full(rowCount, stats.pearsonr(X, Y)[0])

        final_data = np.column_stack((X, Y, k, o, r))

        # Replace the extension only; str.replace would touch every ".csv"
        # in the name and leave names without ".csv" unchanged, which would
        # overwrite the input.
        root, _ = os.path.splitext(dataFileName)
        outputFilePath = os.path.join(dataFilesDir, root + "_converted.csv")

        # Name exactly the two columns that are written, so the header stays
        # aligned with the data even when the input has extra columns.
        csvHeader = ",".join(names[:2]) + ",k,o,r"

        np.savetxt(
            outputFilePath,
            final_data,
            delimiter=",",
            header=csvHeader,
            comments="",
        )


def ConvertDataForOutliers(dataFiles, dataFilesDir):
    """Convert the given files by delegating to ``ConvertDataForCorr``.

    Args:
        dataFiles: File names forwarded unchanged to ``ConvertDataForCorr``.
        dataFilesDir: Directory forwarded unchanged to ``ConvertDataForCorr``.
    """
    ConvertDataForCorr(dataFiles=dataFiles, dataFilesDir=dataFilesDir)

def ConvertDataForClusters(dataFiles, dataFilesDir):
    """Write a ``*_converted.csv`` companion for every listed cluster file.

    For each input the first two columns are copied, the third input column
    is written as ``k``, and two constant columns are appended: ``o`` (all
    zeros) and ``r`` (the Pearson correlation coefficient of the first two
    columns, repeated on every row). The output is written into
    ``dataFilesDir`` under the input name with its extension replaced by
    ``_converted.csv``.

    Args:
        dataFiles: Iterable of CSV file names located in ``dataFilesDir``.
            Each file needs a header row and at least three columns.
        dataFilesDir: Directory that holds the inputs and receives the
            outputs.

    Raises:
        IndexError: If an input file has fewer than three columns.
    """
    for dataFileName in dataFiles:
        # Call form with a single argument prints the same text on
        # Python 2 and Python 3.
        print("processing " + dataFileName)

        data = np.genfromtxt(
            os.path.join(dataFilesDir, dataFileName),
            delimiter=",",
            dtype=None,
            names=True,
        )
        names = data.dtype.names

        X = data[names[0]]
        Y = data[names[1]]
        k = data[names[2]]  # third input column is emitted under the name "k"

        rowCount = len(data)
        o = np.zeros(rowCount)
        r = np.full(rowCount, stats.pearsonr(X, Y)[0])

        final_data = np.column_stack((X, Y, k, o, r))

        # Replace the extension only; str.replace would touch every ".csv"
        # in the name and leave names without ".csv" unchanged, which would
        # overwrite the input.
        root, _ = os.path.splitext(dataFileName)
        outputFilePath = os.path.join(dataFilesDir, root + "_converted.csv")

        # Keep just the names of the two copied columns. Dropping only the
        # third name would leave any further input column names in the
        # header and misalign it with the five written columns.
        csvHeader = ",".join(names[:2]) + ",k,o,r"

        np.savetxt(
            outputFilePath,
            final_data,
            delimiter=",",
            header=csvHeader,
            comments="",
        )


# Directory containing the CSV files to convert. Swap the last component for
# "study2Data/outliers" or "study2Data/clusters" to process the other sets.
dataFilesDir = os.path.join("..", "data", "study2Data/corr")

# Kept as a separate name; it simply aliases the input directory.
outDataDir = dataFilesDir

# One-off preprocessing step for the AMiner data set; uncomment both lines to
# run it and stop before the conversions below.
#FilterAMiner(dataFilesDir + "/AMiner-Author_filre_hi_pi.csv")
#sys.exit(0)

dataFileNames = utilities.getFileNamesFromDir(dataFilesDir, "csv")

# Enable exactly the conversion that matches dataFilesDir.
ConvertDataForCorr(dataFileNames, dataFilesDir)
#ConvertDataForClusters(dataFileNames, dataFilesDir)
#ConvertDataForOutliers(dataFileNames, dataFilesDir)

