-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathiCROUST.py
63 lines (54 loc) · 2.8 KB
/
iCROUST.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# Creator: Atif Hassan
# email-id: [email protected]
# email-id: [email protected]
'''
Copyright [2019] [Atif Hassan]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
import numpy as np
import math
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.neighbors import NearestNeighbors
def _min_dist_to_refs(points, refs):
    """Return, for each row of *points*, the Euclidean distance to its closest row of *refs*."""
    if len(points) == 0:
        return np.array([])
    if len(refs) == 0:
        raise ValueError("cannot measure distances to an empty reference set")
    # Flatten every reference sample to a vector so one norm call per query
    # row covers all references; memory stays linear in len(refs).
    flat_refs = refs.reshape(len(refs), -1)
    return np.array([
        np.linalg.norm(flat_refs - np.ravel(row), axis=1).min() for row in points
    ])


def icroust(X, Y, points_to_remove, n_neighbours, points_to_remove_at_a_time=4, num_clusters=10000, maj_class=0, min_class=1):
    """Under-sample the majority class of a two-class dataset.

    Every majority sample is scored as

        (distance to the closest minority reference point)
        + (mean distance to its ``n_neighbours`` nearest majority samples)

    and, round after round, the ``points_to_remove_at_a_time`` samples with
    the lowest score are dropped. The nearest-neighbour term is recomputed
    each round on the remaining majority samples.

    The minority reference points are the minority samples themselves when
    ``num_clusters`` exceeds their count; otherwise they are the centroids of
    a ``MiniBatchKMeans`` fit with ``num_clusters`` clusters.

    Parameters
    ----------
    X : array-like
        Samples, one per row.
    Y : array-like
        Class label of each sample.
    points_to_remove : int
        Removal budget. Only
        ``floor(points_to_remove / points_to_remove_at_a_time)`` full rounds
        are run, so a remainder smaller than one round is not removed.
    n_neighbours : int
        Number of majority-class neighbours averaged in the density term.
    points_to_remove_at_a_time : int, default 4
        Majority samples dropped per round.
    num_clusters : int, default 10000
        Number of k-means clusters used to summarise the minority class.
    maj_class, min_class : scalar, default 0 and 1
        Label values identifying the majority and minority class.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The retained majority samples followed by all minority samples, and
        the matching label array.
    """
    X = np.asarray(X)
    Y = np.asarray(Y)
    X_minority = X[np.where(Y == min_class)[0]]
    X_majority = X[np.where(Y == maj_class)[0]]

    # Distance of every majority sample to the closest minority reference.
    # (The original tested an undefined name here; the intended knob is
    # ``num_clusters``.)
    if num_clusters > len(X_minority):
        # Too few minority samples to cluster: use them directly.
        dists_cent = _min_dist_to_refs(X_majority, X_minority)
    else:
        kmeans = MiniBatchKMeans(n_clusters=num_clusters, max_iter=100, random_state=0).fit(X_minority)
        dists_cent = _min_dist_to_refs(X_majority, kmeans.cluster_centers_)

    print("Points to remove = "+str(points_to_remove))

    n_rounds = math.floor(points_to_remove / points_to_remove_at_a_time)
    for _ in range(n_rounds):
        # One extra neighbour is requested because querying the fitted set
        # with itself returns each sample as its own neighbour at distance 0,
        # which adds nothing to the sum below.
        neigh = NearestNeighbors(n_neighbors=n_neighbours+1, algorithm='kd_tree', n_jobs=-1).fit(X_majority)
        dist, _neighbour_ids = neigh.kneighbors(X_majority)
        dists_nn = dist.sum(axis=1) / n_neighbours

        # Lowest combined score first; a stable sort keeps the earlier sample
        # on ties, like list.sort() did.
        scores = dists_cent + dists_nn
        drop = np.argsort(scores, kind="stable")[:points_to_remove_at_a_time]

        X_majority = np.delete(X_majority, drop, axis=0)
        # Keep the cached centroid distances aligned with the surviving rows;
        # otherwise later rounds would score samples with another sample's
        # distance.
        dists_cent = np.delete(dists_cent, drop)

    X_out = np.concatenate((X_majority, X_minority), axis=0)
    Y_out = np.array([maj_class] * len(X_majority) + [min_class] * len(X_minority))
    return X_out, Y_out