"""
Generates random scatterplot data with different types of distributions 

Author: Luana Micallef, August 2015
"""


import numpy as np
import os
import matplotlib.pyplot as plt
import utilities

import model
import ellipse




# save a list of x, y points to a csv file 
def savePntsToCVSFile(pnts, outputFilePath):
    """Write a sequence of 2-D points to a CSV file with an ``x,y`` header.

    pnts           -- iterable of indexable points; only elements 0 and 1 of each point are written
    outputFilePath -- path of the file to create (opened in 'w' mode, so existing content is replaced)
    """
    # 'with' closes the handle even when formatting a point or a write raises,
    # which the explicit open()/close() pair did not guarantee
    with open(outputFilePath, 'w') as outfile:
        outfile.write("x,y\n")
        for pnt in pnts:
            outfile.write(str(pnt[0])+','+str(pnt[1])+'\n')
    return


# save a list of x, y, k, o, r to a csv file, where (x,y) is a point, k is the cluster ID, 
# o is 0 if the point is not an outlier and 1 if it is an outlier, r is the correlation
def savePntsWithClusterIdToCVSfile(pnts, outputFilePath, append=False):
    """Write rows of (x, y, cluster id, outlier flag, correlation) to a CSV file.

    pnts           -- iterable of 5-element rows (x, y, k, o, r)
    outputFilePath -- destination file
    append         -- when true, rows are appended to the file and no header is written;
                      otherwise the file is (re)created and starts with the header "x,y,c,o,r"
    """
    if append:
        mode = 'a'
    else:
        # project helper; presumably creates the missing parent directory -- confirm in utilities
        utilities.ensureDirectory(outputFilePath)
        mode = 'w'

    # 'with' closes the handle even if a malformed row makes the unpacking below raise
    with open(outputFilePath, mode) as outfile:
        if not append:
            outfile.write("x,y,c,o,r\n")
        for x, y, k, o, r in pnts:
            outfile.write(str(x)+','+str(y)+','+str(k)+','+str(o)+','+str(r)+'\n')
    return



# generates N random points from a uniform distribution in [0,1]
def pntsFromUniformDist(N, outputFilePath = ""):
    """Draw N points whose x and y are uniform in [0, 1).

    Returns the samples as an (N, 2) numpy array. When outputFilePath is not
    the empty string the samples are also written to that file as x,y CSV.
    """
    samples = np.random.rand(N, 2)

    wantsFile = (outputFilePath != "")
    if wantsFile:
        savePntsToCVSFile(samples, outputFilePath)

    return samples
    


# generates N random points from a standard normal distribution
# more about distribution: http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.randn.html
# equivalent samples can be drawn with corrpntsFromMultiVarNormalDist() using means=[0,0], stds=[1,1] and corr=0
def pntsFromNormalDist(N, outputFilePath = ""):
    """Draw N points whose x and y are independent standard-normal values.

    Returns the samples as an (N, 2) numpy array. When outputFilePath is not
    the empty string the samples are also written to that file as x,y CSV.
    """
    samples = np.random.randn(N, 2)

    wantsFile = (outputFilePath != "")
    if wantsFile:
        savePntsToCVSFile(samples, outputFilePath)

    return samples
    


# generates N random points from a multivariate normal distribution specified by the distribution's mean matrix and covariance matrix
# more about distribution: http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.multivariate_normal.html
def pntsFromMultiVarNormalDist(N, means, covs, outputFilePath = ""):
    """Draw N points from a bivariate normal distribution.

    means -- the two means handed to np.random.multivariate_normal
    covs  -- the 2x2 covariance matrix handed to np.random.multivariate_normal

    Returns a plain Python list of [x, y] pairs (callers rely on it being a
    list, e.g. to concatenate samples with +=). When outputFilePath is not the
    empty string the points are also written to that file as x,y CSV.
    """
    # the sampler returns one row per point; transposing splits it into the x and y columns
    xs, ys = np.random.multivariate_normal(means, covs, N).T

    pnts = [[xCoord, yCoord] for xCoord, yCoord in zip(xs, ys)]

    if (outputFilePath != ""):
        savePntsToCVSFile(pnts, outputFilePath)

    return pnts
        
    
    
# generates N random points with a specific bivariate correlation (i.e., corr) from a multivariate normal distribution specified by the distribution's mean matrix and covariance matrix
# based on http://stackoverflow.com/questions/18683821/generating-random-correlated-x-and-y-points-using-numpy -> 
# "you can set up the sigmas so that your half-interval correspond to 3 standard deviations (you can also filter out the bad points if needed). In this way you will have ~99% of your points inside your interval"
def corrpntsFromMultiVarNormalDist(N, means, stds, corr, outputFilePath = ""):
    """Draw N bivariate-normal points with a prescribed x/y correlation.

    means -- [mean_x, mean_y]
    stds  -- [std_x, std_y]
    corr  -- correlation coefficient used for the off-diagonal covariance terms

    The covariance matrix is assembled from stds and corr and the sampling is
    delegated to pntsFromMultiVarNormalDist, whose list of [x, y] pairs is
    returned unchanged (outputFilePath is forwarded to it as well).

    An earlier revision derived means and stds from x/y ranges (xx_r, yy_r):
    means were the range means and stds were the range standard deviations
    divided by 3; callers now pass means and stds explicitly.
    """
    sdX = stds[0]
    sdY = stds[1]
    crossCov = sdX*sdY*corr

    covs = [[sdX**2,   crossCov],
            [crossCov, sdY**2  ]]

    return pntsFromMultiVarNormalDist(N, means, covs, outputFilePath)
    
    
    
# generates a total of N random points from bivariate normal distributions (random means and stds) 
# these N points have K clusters, where each cluster has at least int(N/K) points, 
# but a few clusters could have one extra point 
# the correlation of each cluster is the given corr or, when corr is None, a randomly picked multiple of 0.25 in [-1,1]
# more info about make_blobs http://scikit-learn.org/stable/auto_examples/datasets/plot_random_dataset.html
def pntsFromNormalDistWithKClustersAndOutliers (N, K, corr = None, outputDirPath = "", filesuffix = ""):
    """Generate N bivariate-normal points in K clusters and flag each cluster's outliers.

    N             -- total number of points; every cluster gets N // K points and the
                     first N % K clusters get one extra
    K             -- number of clusters
    corr          -- correlation used for every cluster; when None each cluster draws a
                     distinct value from the 9 multiples of 0.25 in [-1, 1]
                     (so at most 9 clusters can be served before the pool is empty and
                     np.random.randint raises ValueError)
    outputDirPath -- when not "", the rows are written to
                     <outputDirPath>/randNorm_<N>pnts_<K>clusters<filesuffix>.csv
    filesuffix    -- text inserted before ".csv" in the file name

    Returns a numpy array with one row [x, y, cluster id, outlier flag, correlation]
    per point; the outlier flag is 1 where is_outlier(cluster, 2.5) is true.
    """
    noOfPointsPerCluster = int(N // K)
    noOfRemainingPoints = N % K

    # pool of candidate correlations; a picked value is removed so clusters do not share it
    rs = np.arange(-1.0, 1.25, 0.25)

    pnts = []
    # one pair of standard deviations is shared by all clusters; means are drawn per cluster
    randfloats_std = np.random.uniform(0.0, 10.0, [2])
    stds = [randfloats_std[0], randfloats_std[1]]

    for k in range(0, K):
        if corr is None:
            # randint's upper bound is exclusive: passing rs.size (not rs.size-1) makes the
            # last candidate reachable and keeps working when a single candidate is left
            r_index = np.random.randint(0, rs.size)
            r = round(rs[r_index], 2)
            rs = np.delete(rs, r_index)
        else:
            r = corr

        randfloats_means = np.random.uniform(0.0, 10.0, [2])
        means = [randfloats_means[0], randfloats_means[1]]

        # hand out the N % K leftover points one per cluster
        n_k = noOfPointsPerCluster
        if noOfRemainingPoints > 0:
            n_k += 1
            noOfRemainingPoints -= 1

        pnts_k = corrpntsFromMultiVarNormalDist (n_k, means, stds, r, outputFilePath="")

        isoutlierBoolFlags = is_outlier(np.array(pnts_k), 2.5)
        isoutlierIntFlags = [int(flag) for flag in isoutlierBoolFlags]
        pnts.extend([x, y, k, o, r] for [x, y], o in zip(pnts_k, isoutlierIntFlags))

    # save to file
    if (outputDirPath != ""):
        outputFilePath = outputDirPath + os.path.sep + "randNorm_"+str(N)+"pnts_"+str(K)+"clusters"+filesuffix+".csv"
        savePntsWithClusterIdToCVSfile(pnts, outputFilePath, append=False)

    return np.array(pnts)

# generates a total of N random points from a standard normal distribution with manually added outliers  
def pntsFromNormalDistWithKClustersAndManuallyAddedOutliers (N, K, O, corr = None, outputDirPath = "", filesuffix = ""):
    """Generate K bivariate-normal clusters and inject O hand-made outliers into each.

    N             -- number of points sampled in total before filtering; every cluster
                     samples N // K points and the first N % K clusters sample one extra
    K             -- number of clusters
    O             -- number of outliers added to each cluster
    corr          -- correlation used for every cluster; when None each cluster draws a
                     distinct value from the 9 multiples of 0.25 in [-1, 1]
    outputDirPath -- when not "", the rows are written to
                     <outputDirPath>/randNorm_<N>pnts_<K>clusters<filesuffix>.csv
    filesuffix    -- text inserted before ".csv" in the file name

    For each cluster the sampled points flagged by is_outlier(cluster, 3.0) are dropped
    (flag 0 for the rest) and O generated outliers (flag 1) are appended, so the number
    of returned rows is not necessarily N.

    Returns a numpy array with one row [x, y, cluster id, outlier flag, correlation] per point.
    """
    noOfPointsPerCluster = int(N // K)
    noOfRemainingPoints = N % K

    # pool of candidate correlations; a picked value is removed so clusters do not share it
    rs = np.arange(-1.0, 1.25, 0.25)

    pnts = []
    # one pair of standard deviations is shared by all clusters; means are drawn per cluster
    randfloats_std = np.random.uniform(0.0, 10.0, [2])
    stds = [randfloats_std[0], randfloats_std[1]]

    for k in range(0, K):
        if corr is None:
            # randint's upper bound is exclusive: passing rs.size (not rs.size-1) makes the
            # last candidate reachable and keeps working when a single candidate is left
            r_index = np.random.randint(0, rs.size)
            r = round(rs[r_index], 2)
            rs = np.delete(rs, r_index)
        else:
            r = corr

        randfloats_means = np.random.uniform(0.0, 10.0, [2])
        means = [randfloats_means[0], randfloats_means[1]]

        # hand out the N % K leftover points one per cluster
        n_k = noOfPointsPerCluster
        if noOfRemainingPoints > 0:
            n_k += 1
            noOfRemainingPoints -= 1

        pnts_k = corrpntsFromMultiVarNormalDist (n_k, means, stds, r, outputFilePath="")

        # keep only the sampled points that are not outliers; all of them get flag 0
        # (an SD-based filter via isOutlierBasedOnSD was tried here and judged not fully optimal)
        isoutlierBoolFlags = is_outlier(np.array(pnts_k), 3.0)
        pnts_k_filtered = [p for p, flagged in zip(pnts_k, isoutlierBoolFlags) if not flagged]
        isoutlierIntFlags_forFiltered = [0] * len(pnts_k_filtered)

        # project helper; presumably the covariance ellipse of the kept points at 3 SDs -- confirm in model
        covell = model.getCovEllipseForPoints(np.array(pnts_k_filtered), 3.0)
        for _ in range(0, O):
            # re-draw until the candidate is new and lies outside the ellipse
            while True:
                randOutlierPnt = generateRandomOutlierPointForNormDist(means, stds, sdOutlierThresh=3.0)
                if (not randOutlierPnt in pnts_k_filtered) and (not covell.isPointInEllipse(randOutlierPnt[0],randOutlierPnt[1])):
                    break
            pnts_k_filtered.append(randOutlierPnt)
            isoutlierIntFlags_forFiltered.append(1)

        pnts.extend([x, y, k, o, r] for [x, y], o in zip(pnts_k_filtered, isoutlierIntFlags_forFiltered))

    # save to file
    if (outputDirPath != ""):
        outputFilePath = outputDirPath + os.path.sep + "randNorm_"+str(N)+"pnts_"+str(K)+"clusters"+filesuffix+".csv"
        savePntsWithClusterIdToCVSfile(pnts, outputFilePath, append=False)

    return np.array(pnts)
    
    
    
    
# Generate plots each one with one cluster and one of the correlations in the linspace that we are assessing    
# ****** N.B. Not fully functional due to some changes to some functions (e.g., corrpntsFromMultiVarNormalDist)
   
def generateDataset1ClusterDifferentCorrelations (N, outputDirPath = ""):
    """Write one single-cluster dataset of N points for each of 20 correlations in [-1, 1].

    N             -- number of points per dataset
    outputDirPath -- when not "", dataset i is written to
                     <outputDirPath>/randNorm_<N>pnts_1cluster_<i>.csv; when "" the points
                     are generated but nothing is saved

    Each row written is [x, y, 0, outlier flag, correlation], where the flag is 1 for
    points reported by is_outlier(points, 3). Returns None.

    The previous body called corrpntsFromMultiVarNormalDist with xx_r/yy_r keyword
    arguments that the function no longer accepts and read pnts_k_norm, which was never
    assigned, so it could not run.
    """
    rs = np.linspace(-1.0, 1.0, num=20, endpoint=True)

    for i, r in enumerate(rs):
        randnos = np.random.rand(4)

        # NOTE(review): means/stds are rebuilt with the formulas left commented out in
        # corrpntsFromMultiVarNormalDist (range mean, range std / 3) -- confirm this is
        # still the intended way to turn the random x/y ranges into a distribution
        xx = np.array([randnos[0], randnos[1]])
        yy = np.array([randnos[2], randnos[3]])
        means = [xx.mean(), yy.mean()]
        stds = [xx.std() / 3, yy.std() / 3]

        pnts_k = corrpntsFromMultiVarNormalDist (N, means, stds, r, outputFilePath="")

        # NOTE(review): the normalization call had been commented out although its result
        # was still used below; it is restored here -- confirm it should not be dropped instead
        pnts_k_norm = normalizePoints(np.array(pnts_k), [0.0,1.0], [0.0,1.0])

        isoutlierBoolFlags = is_outlier(np.array(pnts_k_norm), 3)
        isoutlierIntFlags = [int(flag) for flag in isoutlierBoolFlags]
        pnts = [[x,y,0,o,r] for [x,y],o in zip(pnts_k_norm,isoutlierIntFlags)]

        # save to file
        if (outputDirPath != ""):
            outputFilePath = outputDirPath + os.path.sep + "randNorm_"+str(N)+"pnts_1cluster_"+str(i)+".csv"
            savePntsWithClusterIdToCVSfile(pnts, outputFilePath, append=False)

    return



# Normalize the points to the required scale
# inputs: pnts as numpy array, x and y range dest in the form of [min, max]
def normalizePoints(pnts, xs_range_dst, ys_range_dst):
    """Linearly rescale the x and y columns of pnts onto the requested ranges.

    pnts         -- numpy array with one [x, y] row per point
    xs_range_dst -- [min, max] that the smallest / largest x are mapped to
    ys_range_dst -- [min, max] that the smallest / largest y are mapped to

    Returns a float numpy array of the same shape as pnts. The two columns are rescaled
    independently, each from its own observed [min, max].
    """
    xs, ys = pnts.T

    def _rescale(vals, dst):
        # same formula as scale(), applied to the whole column at once
        lo = min(vals)
        hi = max(vals)
        return (((vals - lo) / float(hi - lo)) * (dst[1] - dst[0])) + dst[0]

    # column_stack builds the (n, 2) result directly; np.array(zip(...)) only did so on
    # Python 2, where zip returns a list (on Python 3 it wraps the zip object in a 0-d array)
    return np.column_stack((_rescale(xs, xs_range_dst), _rescale(ys, ys_range_dst)))
    


# Scale the given value from the scale of source to the scale of destination
def scale(val, src, dst):
    """Map val linearly from the interval src=[min, max] onto the interval dst=[min, max]."""
    srcSpan = float(src[1]-src[0])
    # position of val inside the source interval (0 at src[0], 1 at src[1])
    fraction = (val-src[0])/srcSpan
    dstSpan = dst[1]-dst[0]
    return (fraction*dstSpan)+dst[0]


# Get + and - values that are K standard deviations away from mean
def getValuesKSDsAwayMean(mean, sd, k):
    """Return [mean - k*sd, mean + k*sd], the two values k standard deviations from the mean."""
    offset = sd*k
    lower = mean - offset
    upper = mean + offset
    return [lower, upper]


# Generate random outlier point given the standard deviation of x and y data points in the distribution 
# By default an outlier that is +/- 3 standard deviations away from mean is returned  
def generateRandomOutlierPointForNormDist (means_xy, sd_xy, sdOutlierThresh=3):
    """Generate one random [x, y] point lying outside the outlier threshold on both axes.

    means_xy        -- [mean_x, mean_y] of the distribution
    sd_xy           -- [sd_x, sd_y] of the distribution
    sdOutlierThresh -- number of standard deviations from the mean beyond which a
                       coordinate counts as outlying (3 by default)

    For each axis one candidate is drawn uniformly below the lower threshold and one above
    the upper threshold, both limited by an outer bounding box, and one of the two is
    picked at random.
    """
    noOfSD_bbox = sdOutlierThresh+0.5

    def _drawCoordinate(mean, sd):
        innerLow, innerHigh = getValuesKSDsAwayMean(mean, sd, sdOutlierThresh)
        # NOTE(review): the box ends up sdOutlierThresh*(sdOutlierThresh+0.5) SDs from the
        # mean (10.5 for the default); the name noOfSD_bbox suggests sdOutlierThresh+0.5 SDs
        # may have been intended -- confirm
        outerLow, outerHigh = getValuesKSDsAwayMean(mean, sd, sdOutlierThresh*noOfSD_bbox)

        belowCandidate = np.random.uniform(outerLow, innerLow, [1])
        aboveCandidate = np.random.uniform(innerHigh, outerHigh, [1])
        return np.random.choice(np.concatenate((belowCandidate, aboveCandidate)))

    randfloat_x = _drawCoordinate(means_xy[0], sd_xy[0])
    randfloat_y = _drawCoordinate(means_xy[1], sd_xy[1])

    return [randfloat_x, randfloat_y]


# Is outlier based on standard deviation 
# By default an outlier is a point that is +/- 3 standard deviations away from mean
# NOT FULLY CORRECT
def isOutlierBasedOnSD (pnt, means_xy, sd_xy, sdOutlierThresh=3):
    """Tell whether pnt is more than sdOutlierThresh standard deviations out on BOTH axes.

    pnt             -- (x, y) pair
    means_xy        -- [mean_x, mean_y]
    sd_xy           -- [sd_x, sd_y]
    sdOutlierThresh -- number of standard deviations defining the limits (3 by default)

    The original author marks this test as not fully correct.
    NOTE(review): a point outlying on a single axis is reported as a non-outlier --
    possibly the reason for that remark; confirm before relying on this function.
    """
    x, y = pnt

    xLow, xHigh = getValuesKSDsAwayMean(means_xy[0], sd_xy[0], sdOutlierThresh)
    yLow, yHigh = getValuesKSDsAwayMean(means_xy[1], sd_xy[1], sdOutlierThresh)

    xOutside = (x < xLow) or (x > xHigh)
    yOutside = (y < yLow) or (y > yHigh)

    return (xOutside and yOutside)



## src: https://github.com/joferkington/oost_paper_code/blob/master/utilities.py
## explanation: http://stackoverflow.com/questions/22354094/pythonic-way-of-detecting-outliers-in-one-dimensional-observation-data
#def is_outlier(points,  3.5):
#    """
#    Returns a boolean array with True if points are outliers and False 
#    otherwise.

#    Parameters:
#    -----------
#        points : An numobservations by numdimensions array of observations
#        thresh : The modified z-score to use as a threshold. Observations with
#            a modified z-score (based on the median absolute deviation) greater
#            than this value will be classified as outliers.

#    Returns:
#    --------
#        mask : A numobservations-length boolean array.

#    References:
#    ----------
#        Boris Iglewicz and David Hoaglin (1993), "Volume 16: How to Detect and
#        Handle Outliers", The ASQC Basic References in Quality Control:
#        Statistical Techniques, Edward F. Mykytka, Ph.D., Editor. 
#    """
#    if len(points.shape) == 1:
#        points = points[:,None]
#    median = np.median(points, axis=0)
#    diff = np.sum((points - median)**2, axis=-1)
#    diff = np.sqrt(diff)
#    med_abs_deviation = np.median(diff)

#    modified_z_score = 0.6745 * diff / med_abs_deviation

#    return modified_z_score > thresh
    

# generates a total of N random points from a standard normal distribution with manually added outliers  
def pntsOutliersMahalanobisDistance (N, K, O, corr = None, outputDirPath = "", filesuffix = ""):
    """Generate K clusters totalling N points, each with O outliers chosen by Mahalanobis distance.

    N             -- total number of points; every cluster gets N // K points and the
                     first N % K clusters get one extra
    K             -- number of clusters
    O             -- number of outliers per cluster (counted inside the cluster's size)
    corr          -- correlation used for every cluster; when None each cluster draws a
                     distinct value from the 7 multiples of 0.25 in [-0.75, 0.75]
    outputDirPath -- when not "", the rows are written to
                     <outputDirPath>/randNorm_<N>pnts_<K>clusters<filesuffix>.csv and a
                     warning is printed if fewer than N points were produced
    filesuffix    -- text inserted before ".csv" in the file name

    Per cluster:
      1. sample 70% of the points at the full spread and 30% at a third of it;
      2. drop the points that is_outlier() reports at threshold 4;
      3. trim or top up (with accepted non-outliers) to exactly size - O points, flag 0;
      4. add points drawn with 2.5x the spread and zero correlation that is_outlier()
         reports at threshold 6 but not at threshold 8, flag 1.
    Steps 3 and 4 give up after 1000000 consecutive rejected candidates.

    Returns a numpy array with one row [x, y, cluster id, outlier flag, correlation] per point.
    """
    noOfPointsPerCluster = int(N // K)
    noOfRemainingPoints = N % K

    # pool of candidate correlations; a picked value is removed so clusters do not share it
    rs = np.arange(-0.75, 1., 0.25)

    pnts = []
    # one pair of standard deviations is shared by all clusters; means are drawn per cluster
    randfloats_std = np.random.uniform(0.0, 10.0, [2])
    stds = [randfloats_std[0], randfloats_std[1]]

    #outlierRemThreshold = 2 #for catch
    outlierRemThreshold = 4
    max_iter = 1000000
    std_spread_factor = 2.5

    for k in range(0, K):
        if corr is None:
            # randint's upper bound is exclusive: passing rs.size (not rs.size-1) makes the
            # last candidate reachable and keeps working when a single candidate is left
            r_index = np.random.randint(0, rs.size)
            r = round(rs[r_index], 2)
            rs = np.delete(rs, r_index)
        else:
            r = corr

        randfloats_means = np.random.uniform(0.0, 10.0, [2])
        means = [randfloats_means[0], randfloats_means[1]]

        # hand out the N % K leftover points one per cluster
        n_k = noOfPointsPerCluster
        if noOfRemainingPoints > 0:
            n_k += 1
            noOfRemainingPoints -= 1

        # floor division keeps the sample sizes integral on Python 3 as well
        pnts_k = corrpntsFromMultiVarNormalDist ((7*n_k)//10, means, stds, r, outputFilePath="")
        newstds = [stds[0] / 3.0, stds[1] / 3.0]
        pnts_k += corrpntsFromMultiVarNormalDist ((3*n_k)//10, means, newstds, r, outputFilePath="")

        # after getting the blob, remove the points that are too far out
        pointsToRemoveBoolFlags = is_outlier(np.array(pnts_k), outlierRemThreshold)
        pnts_k = np.array(pnts_k)[np.logical_not(pointsToRemoveBoolFlags)]

        # the cluster must end up with n_k points, O of which are outliers; using n_k
        # (rather than the base size) keeps the extra remainder point so the total is N
        numberOfNonOutliers = n_k - O
        diffNumPoints = len(pnts_k) - numberOfNonOutliers
        if diffNumPoints > 0:
            # draw distinct indices: sampling with replacement could repeat an index and
            # therefore delete fewer points than required
            indices_to_remove = np.random.choice(len(pnts_k), min(diffNumPoints, len(pnts_k)), replace=False)
            pnts_k = np.delete(pnts_k, indices_to_remove, axis = 0)
        else:
            curr_iter = 0
            while len(pnts_k) < numberOfNonOutliers and curr_iter < max_iter:
                new_point = corrpntsFromMultiVarNormalDist (1, means, stds, r, outputFilePath="")
                temp_array = np.append(pnts_k, new_point, axis = 0)

                # accept the candidate only if it is not an outlier of the enlarged cluster
                if not is_outlier(temp_array, outlierRemThreshold)[-1]:
                    pnts_k = temp_array
                    curr_iter = 0

                curr_iter += 1

        # flags: 0 for every point kept so far, 1 for the O outliers appended below
        isoutlierIntFlags_forFiltered = np.concatenate((np.zeros(len(pnts_k), dtype=int), np.ones(O, dtype=int)))

        curr_iter = 0
        outlier_stds = [stds[0] * std_spread_factor, stds[1] * std_spread_factor]
        while len(pnts_k) < n_k and curr_iter < max_iter:
            new_point = corrpntsFromMultiVarNormalDist (1, means, outlier_stds, 0, outputFilePath="")
            temp_array = np.append(pnts_k, new_point, axis = 0)
            # keep candidates that are clearly outlying (threshold 6) but not extreme (threshold 8)
            term1 = is_outlier(temp_array, 6)[-1]
            term2 = not is_outlier(temp_array, 8)[-1]

            if term1 and term2:
                pnts_k = temp_array
                curr_iter = 0

            curr_iter += 1

        pnts.extend([x, y, k, o, r] for [x, y], o in zip(pnts_k.tolist(), isoutlierIntFlags_forFiltered.tolist()))

    # save to file
    if (outputDirPath != ""):
        if len(pnts) < N:
            print("WARNING: DIFFERENT SIZE EXPECTED FOR: " + "randNorm_"+str(N)+"pnts_"+str(K)+"clusters"+filesuffix+".csv")

        outputFilePath = outputDirPath + os.path.sep + "randNorm_"+str(N)+"pnts_"+str(K)+"clusters"+filesuffix+".csv"
        savePntsWithClusterIdToCVSfile(pnts, outputFilePath, append=False)

    return np.array(pnts)



# See http://nbviewer.jupyter.org/gist/kevindavenport/7771325
def MahalanobisDistance(datapoints):
    """Return the Mahalanobis distance of every point to the centre of the data set.

    datapoints -- array-like with one observation per row and one variable per column

    The centre is the column-wise mean and the metric is the inverse of the sample
    covariance matrix of datapoints. Returns a 1-D numpy array with one distance per row.
    np.linalg.inv raises LinAlgError when the covariance matrix is singular.
    """
    datapoints = np.asarray(datapoints)

    Covariance = np.cov(datapoints, rowvar=0)
    InvCovariance = np.linalg.inv(Covariance)
    Diff = datapoints - np.mean(datapoints, axis = 0)

    # row-wise quadratic form diff_i . InvCov . diff_i for all rows at once, replacing
    # the per-row Python loop (which relied on the Python 2-only xrange)
    SquaredDistance = np.sum(np.dot(Diff, InvCovariance) * Diff, axis = 1)

    return np.sqrt(SquaredDistance)


def is_outlier(datapoints, Threshold=3): #MahalanobisOutlierDetection
    """Flag the points whose Mahalanobis distance exceeds Threshold times the mean distance.

    datapoints -- observations, one per row, as accepted by MahalanobisDistance
    Threshold  -- multiple of the mean Mahalanobis distance used as the cut-off (3 by default)

    Returns a boolean numpy array with one entry per point (True = outlier).
    """
    distances = MahalanobisDistance(datapoints)
    return distances > (Threshold * np.mean(distances))


def CreateDatasetAspectRatioFigure(dirpath, numberOfPoints):
    """Write the two-cluster dataset used for the aspect-ratio figure.

    dirpath        -- directory that receives the CSV file
    numberOfPoints -- number of points in each of the two clusters

    Two clusters (correlations 0.75 and -0.25) are sampled, each is min-max rescaled to
    the unit square, and the second one is shifted by 1 along x so that the clusters sit
    side by side. No point is flagged as an outlier. The rows are saved to
    <dirpath>/randNorm_<numberOfPoints>pnts_2clusters_aspect_ratioex.csv.
    """
    correlations  = [0.75, -0.25]

    stds = [2, 2]
    means = [0, 0]

    def _sampleIntoUnitSquare(corr):
        # sample a cluster and min-max rescale each axis to [0, 1]
        raw = corrpntsFromMultiVarNormalDist (numberOfPoints, means, stds, corr, outputFilePath="")
        lowest = np.amin(raw, axis = 0)
        extent = np.subtract(np.amax(raw, axis = 0), lowest)
        return np.subtract(raw, lowest) / extent

    leftBlob = _sampleIntoUnitSquare(correlations[0])
    rightBlob = _sampleIntoUnitSquare(correlations[1])
    rightBlob[:,0] += 1

    # per-point columns: cluster id (0 then 1), outlier flag (always 0), correlation
    clusterIds = np.repeat([0.0, 1.0], numberOfPoints)
    outlierFlags = np.zeros(numberOfPoints * 2)
    corrColumn = np.repeat(correlations, numberOfPoints)

    allPoints = np.concatenate((leftBlob, rightBlob))
    final_data = np.column_stack((allPoints, clusterIds, outlierFlags, corrColumn))

    outputFilePath = dirpath + os.path.sep + "randNorm_"+str(numberOfPoints)+"pnts_"+str(2)+"clusters_aspect_ratioex.csv"
    savePntsWithClusterIdToCVSfile(final_data, outputFilePath, append=False)


def CreateDatasetClustersFigure(dirpath, numberOfPoints):
    """Write the three-cluster dataset used for the clusters figure.

    dirpath        -- directory that receives the CSV file
    numberOfPoints -- size of the second and third cluster; the first cluster has
                      int(scale_fact) = 2 times as many points

    Clusters with correlations 0, 0.5 and -0.5 are sampled, min-max rescaled to the unit
    square and centred on their mean. The first cluster is then stretched by 2 along x and
    the other two are shifted by -0.25 and +0.25 along x. No point is flagged as an
    outlier. The rows are saved to
    <dirpath>/randNorm_<numberOfPoints>pnts_3clusters_clusters_ex.csv.
    The mean of each centred cluster is printed as a diagnostic.
    """
    correlations  = [0, 0.5, -0.5]

    stds = [2, 2]
    means = [0, 0]
    scale_fact = 2.

    blobSizes = [int(scale_fact) * numberOfPoints, numberOfPoints, numberOfPoints]

    # sample each cluster and min-max rescale both of its axes to [0, 1]
    blobs = []
    for blobSize, correlation in zip(blobSizes, correlations):
        raw = corrpntsFromMultiVarNormalDist (blobSize, means, stds, correlation, outputFilePath="")
        lowest = np.amin(raw, axis = 0)
        span = np.amax(raw, axis = 0) - lowest
        blobs.append(np.subtract(raw, lowest) / span)

    # centre every cluster on its own mean
    blobs = [np.subtract(blob, np.mean(blob, axis = 0)) for blob in blobs]

    # diagnostic output; print(...) with a single argument behaves the same on Python 2 and 3
    for blob in blobs:
        print(np.mean(blob, axis = 0))

    pnts_blob1, pnts_blob2, pnts_blob3 = blobs
    pnts_blob1[:,0] *= scale_fact
    pnts_blob2[:,0] -= 0.25
    pnts_blob3[:,0] += 0.25

    # per-point columns: cluster id (0, 1, 2), outlier flag (always 0), correlation
    sizes = [len(blob) for blob in blobs]
    k_id = np.repeat(np.arange(len(blobs), dtype=float), sizes)
    o_id = np.zeros(sum(sizes))
    r_array = np.repeat(np.array(correlations, dtype=float), sizes)

    pnts = np.concatenate((pnts_blob1, pnts_blob2, pnts_blob3))

    final_data = np.column_stack((pnts, k_id, o_id, r_array))

    outputFilePath = dirpath + os.path.sep + "randNorm_"+str(numberOfPoints)+"pnts_"+str(3)+"clusters_clusters_ex.csv"
    savePntsWithClusterIdToCVSfile(final_data, outputFilePath, append=False)


def CreateDatasetIntroClusterStudy(dirpath, numberOfPoints = 1000):
    """Write the five-cluster dataset used to introduce the cluster study.

    dirpath        -- directory that receives the CSV file
    numberOfPoints -- number of points in each cluster (1000 by default)

    Five clusters (correlations 0.5, -0.5, 0, 0.5, -0.5) are sampled, each is min-max
    rescaled to the unit square and then scaled and shifted to its position in the
    figure. No point is flagged as an outlier. The rows are saved to
    <dirpath>/randNorm_<numberOfPoints>pnts_5clusters_intro_clusters.csv.
    """
    correlations  = [0.5, -0.5, 0, 0.5, -0.5]

    # (uniform scale factor, [x shift, y shift]) applied to each unit-square cluster, in order
    placements = [
        (1.0,  [-0.5,   0.5]),
        (1.0,  [-0.5,  -0.5]),
        (1.0,  [ 0.5,  -0.5]),
        (1.25, [ 0.25,  0.25]),
        (1.5,  [-0.4,  -0.4]),
    ]

    stds = [2, 2]
    means = [0, 0]

    pnts_blobs = []
    for correlation, (factor, shift) in zip(correlations, placements):
        raw = corrpntsFromMultiVarNormalDist (numberOfPoints, means, stds, correlation, outputFilePath="")
        lowest = np.amin(raw, axis = 0)
        span = np.subtract(np.amax(raw, axis = 0), lowest)
        unitBlob = np.subtract(raw, lowest) / span
        pnts_blobs.append(unitBlob * factor + np.array(shift))

    final_blobs = np.concatenate(pnts_blobs)

    # per-point columns: cluster id, outlier flag (always 0), correlation
    final_k = np.repeat(np.arange(len(correlations)), numberOfPoints)
    o = np.zeros(numberOfPoints * len(correlations))
    # the correlations must be stored as floats: the previous int32 buffer truncated
    # 0.5 and -0.5 to 0, so the r column was written as all zeros
    final_r = np.repeat(np.array(correlations, dtype=float), numberOfPoints)

    final_data = np.column_stack((final_blobs, final_k, o, final_r))

    outputFilePath = dirpath + os.path.sep + "randNorm_"+str(numberOfPoints)+"pnts_"+str(5)+"clusters_intro_clusters.csv"
    savePntsWithClusterIdToCVSfile(final_data, outputFilePath, append=False)


"""
# MAIN

# Generate datasets with 1 cluster but different correlation
#noOfPoints = [100,500,1000,10000]
#dirpath = "../data/test2"
#generateDataset1ClusterDifferentCorrelations(1000,dirpath)
"""




"""
Generating Calibration Study Data
"""

"""
# For CORRELATION
dirpath = "../data/studydata/corr"
noOfClusters = range(1,2)
noOfPoints = [100,500,1000,10000]
rs=np.arange(-1.0,1.25,0.25)
for cn in noOfClusters:
    for pn in noOfPoints:
        r_i=0
        for r in rs:
            pntsFromNormalDistWithKClustersAndOutliers(pn, cn, corr=r, outputDirPath=dirpath, filesuffix="_"+str(r_i)) #str(r))  
            r_i+=1
"""

"""
# For CLUSTERING
dirpath = "../data/studydata/clusters"
noOfClusters = range(2,6)
noOfPoints = [100,500,1000,10000]
reps = 5
for cn in noOfClusters:
    for pn in noOfPoints:
        rep_i = 0
        for rep in range (0,reps):
            pntsFromNormalDistWithKClustersAndOutliers(pn, cn, outputDirPath=dirpath, filesuffix="_"+str(rep_i))
            rep_i += 1
"""

"""

# For OUTLIERS
dirpath = "../data/studydata/outliers"
noOfClusters = range(1,2)
noOfPoints = [10000,1000,500,100]
#noOfPoints = [10000] #for catch
#noOfOutliers=range(1,10)
noOfOutliers=range(1,6)
#noOfOutliers = [0] #for catch
for cn in noOfClusters:
    for pn in noOfPoints:
        for on in noOfOutliers:
            pntsOutliersMahalanobisDistance(pn, cn, on, outputDirPath=dirpath, filesuffix="_"+str(on))
            #pntsOutliersMahalanobisDistance(pn, cn, on, corr = 0.5, outputDirPath=dirpath, filesuffix="_"+str(on)+"_catch") #for catch
           
"""

#dirpath = "../data/studydata/clusters"
#CreateDatasetIntroClusterStudy(dirpath)

##CreateDatasetAspectRatioFigure("../data/figuresData" ,1000)
#CreateDatasetClustersFigure("../data/figuresData" ,1000)