Implementing K-medoids in PySpark

I cannot find a library for PAM (K-medoids) in PySpark.
I have found this implementation in Scala:
https://gist.github.com/erikerlandson/c3c35f0b1aae737fc884
And this Spark issue, which was resolved in 2016:
https://issues.apache.org/jira/browse/SPARK-4510
https://github.com/apache/spark/pull/3382
But it does not seem to work, and it is not included in the MLlib documentation:
http://spark.apache.org/docs/2.0.0/api/python/pyspark.mllib.html#module-pyspark.mllib.clustering
Does anyone know of a library for PAM in PySpark?
Thank you.

I actually had a go at this the other day for fun. I can't say much about performance as I'm quite new to Spark, but here is K-medoids with k-means++-style seeding:
# (c) 2020 Jonathan Kelsey
# This code is licensed under MIT license
from pyspark.sql import functions as F
import pyspark
import numpy as np
import sys

def seed_kernel(data_broadcast, data_id_value, centeroids, k, metric):
    data = data_broadcast.value
    point = data_id_value[1]
    min_distance = sys.maxsize
    for j in range(len(centeroids)):
        distance = metric(point, data[centeroids[j]])
        min_distance = min(min_distance, distance)
    return min_distance
def seed_clusters(data_broadcast, data_frame, k, metric):
    data = data_broadcast.value
    centeroids = list(np.random.choice(data.shape[0], 1, replace=False))
    for i in range(k - 1):
        print("clusterSeed", i)
        distances = []
        mK = data_frame.rdd.map(lambda data_id_value: seed_kernel(data_broadcast, data_id_value, centeroids, k, metric))
        mK_collect = mK.collect()
        distances = np.array(mK_collect)
        next_centeroid = np.argmax(distances)
        centeroids.append(next_centeroid)
    print("centeroids", centeroids)
    return centeroids
def nearest_centeroid_kernel(data_id_value, centeroid_id_values, metric):
    _, data_value = data_id_value
    data_np = np.asarray(data_value)
    distances = []
    for _, centeroid_value in centeroid_id_values:
        centeroid_np = np.asarray(centeroid_value)
        distance = metric(data_np, centeroid_np)
        distances.append(distance)
    distances = np.asarray(distances)
    closest_centeroid = np.argmin(distances)
    return int(closest_centeroid)
def optimise_cluster_membership_spark(data, data_frame, n, metric, intital_cluster_indices=None):
    data_shape = data.shape
    data_rdd = data_frame.rdd
    data_length = data_shape[0]
    if intital_cluster_indices is None:
        index = np.random.choice(data_length, n, replace=False)
    else:
        index = intital_cluster_indices
    list_index = [int(i) for i in list(index)]
    centeroid_id_values = [(i, data[index[i]]) for i in range(len(index))]
    data_rdd = data_rdd.filter(lambda data_id_value: int(data_id_value["id"]) not in list_index)
    associated_cluster_points = data_rdd.map(lambda data_id_value: (data_id_value[0], nearest_centeroid_kernel(data_id_value, centeroid_id_values, metric)))
    clusters = associated_cluster_points.toDF(["id", "bestC"]).groupBy("bestC").agg(F.collect_list("id").alias("cluster"))
    return index, clusters
def cost_kernel(data_broadcast, test_centeroid, cluster_data, metric):
    data = data_broadcast.value
    cluster = np.asarray(cluster_data)
    cluster_length = cluster.shape[0]
    feature_length = data.shape[1]
    test_centeroid_column = np.zeros(shape=(cluster_length, feature_length), dtype=data.dtype)
    new_cluster_column = np.zeros(shape=(cluster_length, feature_length), dtype=data.dtype)
    for i in range(0, cluster_length):
        new_cluster_column[i] = data[cluster[i]]
        test_centeroid_column[i] = data[int(test_centeroid)]
    pairwise_distance = metric(new_cluster_column, test_centeroid_column)
    cost = np.sum(pairwise_distance)
    return float(cost)
def optimise_centroid_selection_spark(data_broadcast, data_frame, centeroids, clusters_frames, metric):
    data = data_broadcast.value
    new_centeroid_ids = []
    total_cost = 0
    for cluster_idx in range(len(centeroids)):
        old_centeroid = centeroids[cluster_idx]
        cluster_frame = clusters_frames.filter(clusters_frames.bestC == cluster_idx).select(F.explode(clusters_frames.cluster))
        cluster_data = cluster_frame.collect()
        if cluster_data:
            cluster_data = [cluster_data[i].col for i in range(len(cluster_data))]
        else:
            cluster_data = []
        cost_data = cluster_frame.rdd.map(lambda point_id: (point_id[0], cost_kernel(data_broadcast, point_id[0], cluster_data, metric)))
        cost = cost_data.map(lambda point_id_cost: point_id_cost[1]).sum()
        total_cost = total_cost + cost
        point_result = cost_data.sortBy(lambda point_id_cost: point_id_cost[1]).take(1)
        if point_result:
            best_point = point_result[0][0]
        else:
            best_point = old_centeroid
        new_centeroid_ids.append(best_point)
    return (new_centeroid_ids, total_cost)
def validate_metric(metric):
    if metric == "euclidean" or metric == "hamming":
        return True
    if not isinstance(metric, dict):
        return "Metric is not a dictionary, and not a known string 'euclidean' or 'hamming'."
    metric_keys = metric.keys()
    if "point" not in metric_keys or "vector" not in metric_keys:
        return "Metric does not contain a member function for 'point' and/or 'vector'."
    if not callable(metric["point"]) or not callable(metric["vector"]):
        return "Metric.point and/or Metric.vector are not callable functions."
    if metric["point"].__code__.co_argcount != 2 and metric["vector"].__code__.co_argcount != 2:
        return "Metric.point and/or Metric.vector do not both have 2 arguments."
    return True
# pre-defined metrics
# vector metrics
def hamming_vector(stack1, stack2):
    return (stack1 != stack2).sum(axis=1)

def euclidean_vector(stack1, stack2):
    return np.sqrt(((stack2 - stack1) ** 2).sum(axis=1))

# point metrics
def hamming_point(p1, p2):
    return np.sum(p1 != p2)

def euclidean_point(p1, p2):
    return np.sqrt(np.sum((p1 - p2) ** 2))
def fit(sc, data, n_clusters=2, metric="euclidean", seeding="heuristic"):
    metric_valid = validate_metric(metric)
    if metric_valid == True:
        if metric == "euclidean":
            point_metric = euclidean_point
            vector_metric = euclidean_vector
        elif metric == "hamming":
            point_metric = hamming_point
            vector_metric = hamming_vector
        else:
            point_metric = metric["point"]
            vector_metric = metric["vector"]
    else:
        print(metric_valid)
        return
    data_np = np.asarray(data)
    data_broadcast = sc.broadcast(data_np)
    seeds = None
    data_frame = sc.parallelize(data).zipWithIndex().map(lambda xy: (xy[1], xy[0])).toDF(["id", "vector"]).cache()
    if seeding == "heuristic":
        seeds = list(seed_clusters(data_broadcast, data_frame, n_clusters, point_metric))
    last_centeroids, last_clusters = optimise_cluster_membership_spark(data_np, data_frame, n_clusters, point_metric, seeds)
    last_cost = float('inf')
    iteration = 0
    escape = False
    while not escape:
        iteration = iteration + 1
        current_centeroids, current_cost = optimise_centroid_selection_spark(data_broadcast, data_frame, last_centeroids, last_clusters, vector_metric)
        current_centeroids, current_clusters = optimise_cluster_membership_spark(data_np, data_frame, n_clusters, point_metric, current_centeroids)
        print((current_cost < last_cost, current_cost, last_cost, current_cost - last_cost))
        if current_cost < last_cost:
            print(("iteration", iteration, "cost improving...", current_cost, last_cost, current_centeroids))
            last_cost = current_cost
            last_centeroids = current_centeroids
            last_clusters = current_clusters
        else:
            print(("iteration", iteration, "cost got worse or did not improve", current_cost, last_cost))
            escape = True
    bc = last_clusters.sort("bestC", ascending=True).collect()
    unpacked_clusters = [bc[i].cluster for i in range(len(bc))]
    return (last_centeroids, unpacked_clusters)
I used some sample data from pyclustering as a sanity check:
from pyclustering.cluster import cluster_visualizer
from pyclustering.utils import read_sample
from pyclustering.samples.definitions import FCPS_SAMPLES
from pyclustering.samples.definitions import SIMPLE_SAMPLES
sample = read_sample(FCPS_SAMPLES.SAMPLE_GOLF_BALL)
bestCentroids, bestClusters = fit(sc, sample, 9)
visualizer = cluster_visualizer()
visualizer.append_clusters(bestClusters, sample)
visualizer.show()
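Note that validate_metric above also accepts a custom metric supplied as a dict with two-argument "point" and "vector" callables, so you are not limited to Euclidean or Hamming distance. A hypothetical Manhattan-distance metric (my own illustration, not part of the original code) could be passed like this:
import numpy as np

# Hypothetical custom metric: Manhattan (L1) distance.
# "point" compares two single vectors; "vector" compares two stacked arrays row-wise.
manhattan_metric = {
    "point": lambda p1, p2: np.sum(np.abs(p1 - p2)),
    "vector": lambda stack1, stack2: np.abs(stack2 - stack1).sum(axis=1),
}

# Assuming sc and sample are defined as above:
# best_centroids, best_clusters = fit(sc, sample, n_clusters=9, metric=manhattan_metric)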

Your best choice is to adapt this Python implementation into Scala so you take advantage of RDD partitions and distributed computation.
https://github.com/letiantian/kmedoids/blob/master/kmedoids.py
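For reference, the heart of a single-machine PAM implementation like the one linked is a small alternation between an assignment step and a medoid-update step over a precomputed distance matrix. A rough sketch of that idea (my own paraphrase, not the exact code in the linked file):
import numpy as np

def pam_sketch(distance_matrix, k, max_iter=100, rng=None):
    # Classic PAM/k-medoids loop over a precomputed n x n distance matrix.
    rng = rng or np.random.default_rng()
    n = distance_matrix.shape[0]
    medoids = rng.choice(n, k, replace=False)
    for _ in range(max_iter):
        # Assignment step: each point joins the cluster of its nearest medoid.
        labels = np.argmin(distance_matrix[:, medoids], axis=1)
        new_medoids = medoids.copy()
        for cluster_idx in range(k):
            members = np.where(labels == cluster_idx)[0]
            if members.size == 0:
                continue
            # Update step: the member minimising total within-cluster distance becomes the medoid.
            costs = distance_matrix[np.ix_(members, members)].sum(axis=1)
            new_medoids[cluster_idx] = members[np.argmin(costs)]
        if np.array_equal(new_medoids, medoids):
            break
        medoids = new_medoids
    return medoids, labels

Porting this to Scala mostly means replacing the NumPy array operations with loops or Breeze operations, and deciding how to partition the distance computations across an RDD.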

Related

Confused function in python

It doesn't work when I try to execute it; it gets stuck.
def calculate_utility_cost(visitors):
    movies = ['Gifted Hands', 'Legends of the Fall', 'Patch Adams',
              'The Sixth Sense', 'A Beautiful Mind']
    visitors_per_movie = np.repeat(0, len(movies))
    pred_bill_per_week = 100000 / 4
    pred_visitors_per_week = 14413
    return pred_bill_per_week / pred_visitors_per_week * visitors

utility_total_cost1 = int(input(calculate_utility_cost))
utility_total_cost2 = (visitors_per_day)
utility_total_cost = sum = utility_total_cost1 + utility_total_cost2
print(utility_total_cost)

How to compare the data entered from the program and the data in MongoDB?

I want to compare the data entered from the program with the data in MongoDB. How can I do it?
import pymongo
from getpass import getpass
baglanti = pymongo.MongoClient("mongodb+srv://user:secure#denemeler.s4ufh.mongodb.net/Bankamatik?authSource=admin&w=majority&readPreference=primary&appname=MongoDB%20Compass&retryWrites=true&ssl=true")
bankamatik = baglanti["Bankamatik"]
hesap = bankamatik["BankaHesapBilgileri"]
def kayitOl():
    kullaniciadi = input("Kayıt bölümüne hoş geldiniz. \nKullanıcı adı: ")
    sifre = getpass("Şifre(gizli): ")
    hesap.insert({"kullaniciadi": kullaniciadi, "sifre": sifre, "bakiye": 1000})
    print("Kayıt başarıyla oluşturulmuştur.")
    print("Hoş geldiniz.")

girismenu = int(input("Hoş geldiniz. Giriş yapmak isterseniz 1, hesabınız yok ve kayıt olmak isterseniz 2 yazınız."))
if girismenu == 2:
    kayitOl()
if girismenu == 1:
    print("Giriş yapma kısmındasınız. Lütfen bilgilerinizi girin.")
    kullaniciadi = input("Kullanıcı adı: ")
    sifre = getpass("Şifre(gizli): ")
    arama = hesap.find().sort("kullaniciadi")
    for x in arama:
        print(x["kullaniciadi"])
I stayed here and couldn't continue.
Example:
Database: kullaniciadi: "emro", sifre: 33
Program input: kullaniciadi: "emro", sifre: 33
Login successful.
How can I do it?
Pass the filter parameters into find() or find_one()
e.g.
user_input = input("Input")
records = hesap.find({"field": user_input})
or
user_input = input("Input")
record = hesap.find_one({"field": user_input})
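To check a login the way the example describes, one option (a sketch based on the field names in the question, continuing the script above, which already imports getpass) is to pass both fields to find_one and test whether a document came back:
kullaniciadi = input("Kullanıcı adı: ")
sifre = getpass("Şifre(gizli): ")

# find_one returns the matching document, or None if nothing matches.
kayit = hesap.find_one({"kullaniciadi": kullaniciadi, "sifre": sifre})
if kayit is not None:
    print("Login successful.")
else:
    print("Username or password is wrong.")

Note that the stored sifre has to have the same type as the value you query with; the registration code above stores it as a string, while the example document shows a number.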

Can we give Max & Min values statically in normalization using MinMaxScaler Sklearn?

I have a question about this and have been looking for answers.
Below is the input POST request:
{
    "emotive_Score": [0.89, 0.57, 0.089, 0, 0.004, 0, 0],
    "sentiment_Score": [1.521894, -6.4523187],
    "mood_score": [40]
}
And I'm using the following code to scale the values.
from flask import Flask, request
from flask_restful import Resource, Api
from json import dumps
from sklearn import preprocessing
import numpy as np

class MoodScore(Resource):
    def post(self):
        json_data = request.get_json(force=True)
        if not json_data:
            return {'message': 'No input data provided'}, 400
        x = request.json['emotive_Score']
        x1 = request.json['sentiment_Score']
        x2 = request.json['mood_score']
        # Normalisation for Emotive Score
        xEmotive = np.array(x)
        PositiveEmotive = str(xEmotive[4] + xEmotive[6])
        NegativeEmotive = str(xEmotive[0] + xEmotive[1] + xEmotive[2] + xEmotive[3] + xEmotive[5])
        EmotiveScoreArray = (PositiveEmotive, NegativeEmotive)
        Nml = np.array(EmotiveScoreArray)
        float_array = Nml.astype(np.float)
        xM = float_array.reshape(-1, 1)
        minmaxscaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
        Emotive = minmaxscaler.fit_transform(xM)
        # Normalisation for Sentiment Score
        xSentiment = np.array(x1)
        PositiveSentiment = str(xSentiment[0])
        NegativeSentiment = str(xSentiment[1])
        SentimentScoreArray = (PositiveSentiment, NegativeSentiment)
        Nml1 = np.array(SentimentScoreArray)
        float_array1 = Nml1.astype(np.float)
        xM1 = float_array1.reshape(-1, 1)
        minmaxscaler1 = preprocessing.MinMaxScaler(feature_range=(-1, 1))
        Sentiment = minmaxscaler1.fit_transform(xM1)
        return {'PositiveEmotive': str(Emotive[0]), 'NegativeEmotive': str(Emotive[1]), 'PositiveSentiment': str(Sentiment[0]), 'NegativeSentiment': str(Sentiment[1]), 'FinalValue': str(Emotive[0] + Emotive[1] + Sentiment[0] + Sentiment[1])}
        # return {'FinalScore': str(Sentiment)}

app = Flask(__name__)
api = Api(app)
api.add_resource(MoodScore, '/moodScore')

if __name__ == '__main__':
    app.run(port='5005', host="0.0.0.0")
And I'm getting the following as output.
{
    "PositiveEmotive": "[0.]",
    "NegativeEmotive": "[1.]",
    "PositiveSentiment": "[1.]",
    "NegativeSentiment": "[-1.]",
    "FinalValue": "[1.]"
}
I just want to know whether I can give static min and max values for the normalization calculation so that I can get the desired result below:
{
    "PositiveEmotive": "[0.546]",
    "NegativeEmotive": "[1.]",
    "PositiveSentiment": "[0.598]",
    "NegativeSentiment": "[-0.6879.]",
    "FinalValue": "[1.4561]"
}
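MinMaxScaler derives data_min_ and data_max_ from whatever it is fit on, so one way to use static bounds (a sketch, assuming you already know the fixed minimum and maximum you want to scale against; the bounds below are made up) is to fit the scaler once on those bounds and then only call transform on incoming values:
import numpy as np
from sklearn import preprocessing

# Hypothetical fixed bounds; replace with the static min/max you want to scale against.
SENTIMENT_MIN, SENTIMENT_MAX = -10.0, 10.0

# Fit once on the chosen bounds, not on the incoming request data.
scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
scaler.fit(np.array([[SENTIMENT_MIN], [SENTIMENT_MAX]]))

# Transform (not fit_transform) each request's values against those fixed bounds.
scores = np.array([[1.521894], [-6.4523187]])
print(scaler.transform(scores))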

How to make a custom dataset for a classification task when the class is the folder name, using PyTorch?

The problem is that the dataloader is returning the wrong class for the corresponding image.
For example, if I print class_to_idx from the train_loader with a batch size of 1, I was expecting to get one class per batch, but it is currently returning all 15 classes per image.
In this case, the classes come from the folder names (all images in one folder belong to one class).
The snippet is here (this is a function that returns the classes from the folder names in dir):
import os

def find_classes(dir):
    # Finds the class folders in a dataset; dir (string) is the root directory path.
    classes = [d.name for d in os.scandir(dir) if d.is_dir()]
    classes.sort()
    class_to_idx = {classes[i]: i for i in range(len(classes))}
    return classes, class_to_idx
Here is the main snippet for creating the custom dataset and dataloaders:
def main():
    class CustomDataset(Dataset):
        def __init__(self, image_paths, classes, class_to_id):
            self.image_paths = image_paths
            self.transforms = transforms.ToTensor()
            classes, class_to_id = find_classes('D:/Neda/Echo_View_Classification/avi_images/')
            self.classes = classes
            self.class_to_idx = class_to_idx
        def __getitem__(self, index):
            image = Image.open(self.image_paths[index])
            t_image = image.convert('L')
            t_image = self.transforms(t_image)
            class_to_idx = self.class_to_idx
            return t_image, class_to_idx, self.image_paths[index]
        def __len__(self):
            return len(self.image_paths)
    folder_data = glob.glob("D:\\Neda\\Echo_View_Classification\\avi_images\\*\\*.png")  # no augmentation
    #numpy.savetxt('distribution_class.csv', numpy.c_[folder_data], fmt=['%s'], comments='', delimiter=",")
    # split these paths using a certain percentage
    len_data = len(folder_data)
    print("count of dataset: ", len_data)
    split_1 = int(0.6 * len(folder_data))
    split_2 = int(0.8 * len(folder_data))
    folder_data.sort()
    train_image_paths = folder_data[:split_1]
    print("count of train images is: ", len(train_image_paths))
    numpy.savetxt('im_training_path_1.csv', numpy.c_[train_image_paths], fmt=['%s'], comments='', delimiter=",")
    valid_image_paths = folder_data[split_1:split_2]
    print("count of validation image is: ", len(valid_image_paths))
    numpy.savetxt('im_valid_path_1.csv', numpy.c_[valid_image_paths], fmt=['%s'], comments='', delimiter=",")
    test_image_paths = folder_data[split_2:]
    print("count of test images is: ", len(test_image_paths))
    numpy.savetxt('im_testing_path_1.csv', numpy.c_[test_image_paths], fmt=['%s'], comments='', delimiter=",")
    classes = ['1_PLAX_1_PLAX_full',
               '1_PLAX_2_PLAX_valves',
               '1_PLAX_4_PLAX_TV',
               '2_PSAX_1_PSAX_AV',
               '2_PSAX_2_PSAX_LV',
               '3_Apical_1_MV_LA_IAS',
               '3_Apical_2_A2CH',
               '3_Apical_3_A3CH',
               '3_Apical_5_A5CH',
               '4_A4CH_1_A4CH_LV',
               '4_A4CH_2_A4CH_RV',
               '4_Subcostal_1_Subcostal_heart',
               '4_Subcostal_2_Subcostal_IVC',
               'root_5_Suprasternal',
               'root_6_OTHER']
    class_to_idx = {'1_PLAX_1_PLAX_full': 0,
                    '1_PLAX_2_PLAX_valves': 1,
                    '1_PLAX_4_PLAX_TV': 2,
                    '2_PSAX_1_PSAX_AV': 3,
                    '2_PSAX_2_PSAX_LV': 4,
                    '3_Apical_1_MV_LA_IAS': 5,
                    '3_Apical_2_A2CH': 6,
                    '3_Apical_3_A3CH': 7,
                    '3_Apical_5_A5CH': 8,
                    '4_A4CH_1_A4CH_LV': 9,
                    '4_A4CH_2_A4CH_RV': 10,
                    '4_Subcostal_1_Subcostal_heart': 11,
                    '4_Subcostal_2_Subcostal_IVC': 12,
                    'root_5_Suprasternal': 13,
                    'root_6_OTHER': 14}
    train_dataset = CustomDataset(train_image_paths, class_to_idx, classes)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1, shuffle=False, num_workers=0)
    valid_dataset = CustomDataset(valid_image_paths, class_to_idx, classes)
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=1, shuffle=False, num_workers=0)
    test_dataset = CustomDataset(test_image_paths, class_to_idx, classes)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=0)
    dataLoaders = {
        'train': train_loader,
        'valid': valid_loader,
        'test': test_loader,
    }
I think ImageFolder from torchvision.datasets will help you in loading your data.
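A minimal sketch of that approach (assuming the same directory layout as in the question, where each subfolder of avi_images is one class):
import torch
from torchvision import datasets, transforms

# ImageFolder assigns a class index to each subfolder of the root directory,
# so every sample comes back as (image_tensor, class_index).
transform = transforms.Compose([
    transforms.Grayscale(),
    transforms.ToTensor(),
])
dataset = datasets.ImageFolder(root="D:/Neda/Echo_View_Classification/avi_images/", transform=transform)
loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)

print(dataset.class_to_idx)  # folder name -> class index mapping
for image, label in loader:
    print(image.shape, label)  # one class index per image
    break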

Efficient way to optimise Scala code to read a large file that doesn't fit in memory

Problem statement below.
We have a large log file which stores user interactions with an application. The entries in the log file follow this schema: {userId, timestamp, actionType}, where actionType is one of two possible values: [open, close]
Constraints:
The log file is too big to fit in memory on one machine. Also assume that the aggregated data doesn’t fit into memory.
Code has to be able to run on a single machine.
Should not use an out-of-the box implementation of mapreduce or 3rd party database; don’t assume we have a Hadoop or Spark or other distributed computing framework.
There can be multiple entries of each actionType for each user, and there might be missing entries in the log file. So a user might be missing a close record between two open records or vice versa.
Timestamps will come in strictly ascending order.
For this problem, we need to implement a class/classes that computes the average time spent by each user between open and close. Keep in mind that there are missing entries for some users, so we will have to make a choice about how to handle these entries when making our calculations. Code should follow a consistent policy with regards to how we make that choice.
The desired output for the solution should be [{userId, timeSpent},….] for all the users in the log file.
Sample log file (comma-separated, text file)
1,1435456566,open
2,1435457643,open
3,1435458912,open
1,1435459567,close
4,1435460345,open
1,1435461234,open
2,1435462567,close
1,1435463456,open
3,1435464398,close
4,1435465122,close
1,1435466775,close
Approach
Below is the code I've written in Python and Scala, which does not seem efficient or up to the expectations of the given scenario. I'd like feedback from the community of developers in this forum on how we could better optimise this code for the given scenario.
Scala implementation
import java.io.FileInputStream
import java.util.{Scanner, Map, LinkedList}
import java.lang.Long
import scala.collection.mutable

object UserMetrics extends App {
  if (args.length == 0) {
    println("Please provide input data file name for processing")
  }
  val userMetrics = new UserMetrics()
  userMetrics.readInputFile(args(0), if (args.length == 1) 600000 else args(1).toInt)
}
case class UserInfo(userId: Integer, prevTimeStamp: Long, prevStatus: String, timeSpent: Long, occurence: Integer)

class UserMetrics {
  val usermap = mutable.Map[Integer, LinkedList[UserInfo]]()

  def readInputFile(stArr: String, timeOut: Int) {
    var inputStream: FileInputStream = null
    var sc: Scanner = null
    try {
      inputStream = new FileInputStream(stArr)
      sc = new Scanner(inputStream, "UTF-8")
      while (sc.hasNextLine()) {
        val line: String = sc.nextLine()
        processInput(line, timeOut)
      }
      for ((key: Integer, userLs: LinkedList[UserInfo]) <- usermap) {
        val userInfo: UserInfo = userLs.get(0)
        val timespent = if (userInfo.occurence > 0) userInfo.timeSpent / userInfo.occurence else 0
        println("{" + key + "," + timespent + "}")
      }
      if (sc.ioException() != null) {
        throw sc.ioException()
      }
    } finally {
      if (inputStream != null) {
        inputStream.close()
      }
      if (sc != null) {
        sc.close()
      }
    }
  }
  def processInput(line: String, timeOut: Int) {
    val strSp = line.split(",")
    val userId: Integer = Integer.parseInt(strSp(0))
    val curTimeStamp = Long.parseLong(strSp(1))
    val status = strSp(2)
    val uInfo: UserInfo = UserInfo(userId, curTimeStamp, status, 0, 0)
    val emptyUserInfo: LinkedList[UserInfo] = new LinkedList[UserInfo]()
    val lsUserInfo: LinkedList[UserInfo] = usermap.getOrElse(userId, emptyUserInfo)
    if (lsUserInfo != null && lsUserInfo.size() > 0) {
      val lastUserInfo: UserInfo = lsUserInfo.get(lsUserInfo.size() - 1)
      val prevTimeStamp: Long = lastUserInfo.prevTimeStamp
      val prevStatus: String = lastUserInfo.prevStatus
      if (prevStatus.equals("open")) {
        if (status.equals(lastUserInfo.prevStatus)) {
          val timeSelector = if ((curTimeStamp - prevTimeStamp) > timeOut) timeOut else curTimeStamp - prevTimeStamp
          val timeDiff = lastUserInfo.timeSpent + timeSelector
          lsUserInfo.remove()
          lsUserInfo.add(UserInfo(userId, curTimeStamp, status, timeDiff, lastUserInfo.occurence + 1))
        } else if (!status.equals(lastUserInfo.prevStatus)) {
          val timeDiff = lastUserInfo.timeSpent + curTimeStamp - prevTimeStamp
          lsUserInfo.remove()
          lsUserInfo.add(UserInfo(userId, curTimeStamp, status, timeDiff, lastUserInfo.occurence + 1))
        }
      } else if (prevStatus.equals("close")) {
        if (status.equals(lastUserInfo.prevStatus)) {
          lsUserInfo.remove()
          val timeSelector = if ((curTimeStamp - prevTimeStamp) > timeOut) timeOut else curTimeStamp - prevTimeStamp
          lsUserInfo.add(UserInfo(userId, curTimeStamp, status, lastUserInfo.timeSpent + timeSelector, lastUserInfo.occurence + 1))
        } else if (!status.equals(lastUserInfo.prevStatus)) {
          lsUserInfo.remove()
          lsUserInfo.add(UserInfo(userId, curTimeStamp, status, lastUserInfo.timeSpent, lastUserInfo.occurence))
        }
      }
    } else if (lsUserInfo.size() == 0) {
      lsUserInfo.add(uInfo)
    }
    usermap.put(userId, lsUserInfo)
  }
}
Python Implementation
import sys

def fileBlockStream(fp, number_of_blocks, block):
    # A generator that splits a file into blocks and iterates over the lines of one of the blocks.
    assert 0 <= block and block < number_of_blocks  # Assertions to validate the block number given
    assert 0 < number_of_blocks
    fp.seek(0, 2)  # seek to end of file to compute block size
    file_size = fp.tell()
    ini = file_size * block / number_of_blocks  # compute start & end point of file block
    end = file_size * (1 + block) / number_of_blocks
    if ini <= 0:
        fp.seek(0)
    else:
        fp.seek(ini - 1)
        fp.readline()
    while fp.tell() < end:
        yield fp.readline()  # iterate over lines of the particular chunk or block
def computeResultDS(chunk, avgTimeSpentDict, defaultTimeOut):
    countPos, totTmPos, openTmPos, closeTmPos, nextEventPos = 0, 1, 2, 3, 4
    for rows in chunk.splitlines():
        if len(rows.split(",")) != 3:
            continue
        userKeyID = rows.split(",")[0]
        try:
            curTimeStamp = int(rows.split(",")[1])
        except ValueError:
            print("Invalid Timestamp for ID:" + str(userKeyID))
            continue
        curEvent = rows.split(",")[2]
        if userKeyID in avgTimeSpentDict.keys() and avgTimeSpentDict[userKeyID][nextEventPos] == 1 and curEvent == "close":
            # Check if already existing userID with expected Close event 0 - Open; 1 - Close
            # Array value within dictionary stores [No. of pair events, total time spent (Close tm - Open tm), Last Open Tm, Last Close Tm, Next expected Event]
            curTotalTime = curTimeStamp - avgTimeSpentDict[userKeyID][openTmPos]
            totalTime = curTotalTime + avgTimeSpentDict[userKeyID][totTmPos]
            eventCount = avgTimeSpentDict[userKeyID][countPos] + 1
            avgTimeSpentDict[userKeyID][countPos] = eventCount
            avgTimeSpentDict[userKeyID][totTmPos] = totalTime
            avgTimeSpentDict[userKeyID][closeTmPos] = curTimeStamp
            avgTimeSpentDict[userKeyID][nextEventPos] = 0  # Change next expected event to Open
        elif userKeyID in avgTimeSpentDict.keys() and avgTimeSpentDict[userKeyID][nextEventPos] == 0 and curEvent == "open":
            avgTimeSpentDict[userKeyID][openTmPos] = curTimeStamp
            avgTimeSpentDict[userKeyID][nextEventPos] = 1  # Change next expected event to Close
        elif userKeyID in avgTimeSpentDict.keys() and avgTimeSpentDict[userKeyID][nextEventPos] == 1 and curEvent == "open":
            curTotalTime, closeTime = missingHandler(defaultTimeOut, avgTimeSpentDict[userKeyID][openTmPos], curTimeStamp)
            totalTime = curTotalTime + avgTimeSpentDict[userKeyID][totTmPos]
            avgTimeSpentDict[userKeyID][totTmPos] = totalTime
            avgTimeSpentDict[userKeyID][closeTmPos] = closeTime
            avgTimeSpentDict[userKeyID][openTmPos] = curTimeStamp
            eventCount = avgTimeSpentDict[userKeyID][countPos] + 1
            avgTimeSpentDict[userKeyID][countPos] = eventCount
        elif userKeyID in avgTimeSpentDict.keys() and avgTimeSpentDict[userKeyID][nextEventPos] == 0 and curEvent == "close":
            curTotalTime, openTime = missingHandler(defaultTimeOut, avgTimeSpentDict[userKeyID][closeTmPos], curTimeStamp)
            totalTime = curTotalTime + avgTimeSpentDict[userKeyID][totTmPos]
            avgTimeSpentDict[userKeyID][totTmPos] = totalTime
            avgTimeSpentDict[userKeyID][openTmPos] = openTime
            eventCount = avgTimeSpentDict[userKeyID][countPos] + 1
            avgTimeSpentDict[userKeyID][countPos] = eventCount
        elif curEvent == "open":
            # Initialize userid with Open event
            avgTimeSpentDict[userKeyID] = [0, 0, curTimeStamp, 0, 1]
        elif curEvent == "close":
            # Initialize userid with the missing handler function since there is no Open event for this user
            totaltime, OpenTime = missingHandler(defaultTimeOut, 0, curTimeStamp)
            avgTimeSpentDict[userKeyID] = [1, totaltime, OpenTime, curTimeStamp, 0]
def missingHandler(defaultTimeOut, curTimeVal, lastTimeVal):
    if lastTimeVal - curTimeVal > defaultTimeOut:
        return defaultTimeOut, curTimeVal
    else:
        return lastTimeVal - curTimeVal, curTimeVal

def computeAvg(avgTimeSpentDict, defaultTimeOut):
    resDict = {}
    for k, v in avgTimeSpentDict.iteritems():
        if v[0] == 0:
            resDict[k] = 0
        else:
            resDict[k] = v[1] / v[0]
    return resDict
if __name__ == "__main__":
avgTimeSpentDict = {}
if len(sys.argv) < 2:
print("Please provide input data file name for processing")
sys.exit(1)
fileObj = open(sys.argv[1])
number_of_chunks = 4 if len(sys.argv) < 3 else int(sys.argv[2])
defaultTimeOut = 60000 if len(sys.argv) < 4 else int(sys.argv[3])
for chunk_number in range(number_of_chunks):
for chunk in fileBlockStream(fileObj, number_of_chunks, chunk_number):
computeResultDS(chunk, avgTimeSpentDict, defaultTimeOut)
print (computeAvg(avgTimeSpentDict,defaultTimeOut))
avgTimeSpentDict.clear() #Nullify dictionary
fileObj.close #Close the file object
Both programs above give the desired output, but efficiency is what matters for this particular scenario. Let me know if you have anything better or any suggestions on the existing implementation.
Thanks in advance!
What you are after is iterator usage. I'm not going to rewrite your code, but the trick here is likely to be using an iterator. Fortunately, Scala provides decent out-of-the-box tooling for the job.
import scala.io.Source

object ReadBigFiles {
  def read(fileName: String): Unit = {
    val lines: Iterator[String] = Source.fromFile(fileName).getLines
    // now you get iterator semantics for the file line traversal
    // that means you can only go through the lines once, but you don't incur a penalty on heap usage
  }
}
For your use case, you seem to require a lastUser, so you're dealing with groups of 2 entries. I think you have two choices: either go for iterator.sliding(2), which will produce an iterator for every pair, or simply add recursion to the mix using options.
def navigate(source: Iterator[String], last: Option[User]): ResultType = {
  if (source.hasNext) {
    val current = source.next()
    last match {
      case Some(existing) => // compare with previous user etc
      case None => navigate(source, Some(current))
    }
  } else {
    // exit recursion, return result
  }
}
You can avoid all the code you've written to read the file and so on. If you need to count occurrences, simply build a Map inside your recursion, and increment the occurrences at every step based on your business logic.
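The same single-pass idea carries over to the Python version in the question: rather than seeking around the file in blocks, iterate over the file object line by line and keep only per-user running totals. A minimal sketch (assuming the timestamps-in-ascending-order guarantee from the problem statement, and simply dropping unmatched entries instead of applying the timeout policy of the original code):
def average_time_spent(path):
    # One pass over the file; per user keep (total_time, closed_sessions, last_open_ts).
    state = {}
    with open(path) as fp:
        for line in fp:  # the file object is itself a line iterator, so memory use stays flat
            user_id, timestamp, action = line.strip().split(",")
            timestamp = int(timestamp)
            total, count, last_open = state.get(user_id, (0, 0, None))
            if action == "open":
                last_open = timestamp  # an earlier unmatched open is simply dropped
            elif action == "close" and last_open is not None:
                total, count, last_open = total + (timestamp - last_open), count + 1, None
            state[user_id] = (total, count, last_open)
    return {user_id: (total / count if count else 0)
            for user_id, (total, count, _) in state.items()}

# Example: print(average_time_spent("log.txt"))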
from queue import LifoQueue, Queue

def averageTime():
    logs = {}
    records = Queue()
    with open("log.txt") as fp:
        for line in fp:
            # split the CSV line into its fields instead of indexing single characters
            user_id, timestamp, action = line.strip().split(",")
            if user_id not in logs:
                logs[user_id] = LifoQueue()
            logs[user_id].put((int(timestamp), action))
    for k in logs:
        somme = 0
        count = 0
        while not logs[k].empty():
            l = logs[k].get()
            # close timestamps add to the total and open timestamps subtract,
            # so each matched pair contributes (close - open)
            somme = (somme - l[0]) if l[1] == "open" else (somme + l[0])
            count = count + 1
        records.put([k, somme, count // 2])
    while not records.empty():
        record = records.get()
        if record[2] > 0:
            print(f"UserId={record[0]} Avg={record[1] / record[2]}")