Track progress of members of a chord - celery

Is there a way to get the task_ids of all members of a chord? My goal is to track to the progress of each tasks of the chords. Here is my attempt so far:
import random
import time
from typing import List
import celery
from celery import Celery, chord
from celery.utils.log import get_task_logger
app = Celery(
"chaintasks",
backend="redis://localhost:6379",
broker="pyamqp://guest#localhost//",
)
app.conf.update(task_track_started=True, result_persistent=True)
#app.task(bind=True)
def process_item(self: celery.Task, item: str) -> str:
t = random.randint(0, 10)
for i in range(t):
chord_id = self.request.chord["options"]["task_id"]
self.update_state(
meta={"progress": i / t}, # How do I access this progress value?
)
self.update_state( # This way I write the group_id in the main task info,
task_id=chord_id, # but maybe I don't even need this?
meta={"group_id": self.request.group},
state="STARTED"
)
time.sleep(1)
return item + "_processed"
#app.task()
def group_items(items: List[str]) -> str:
return ":".join(items)
#app.task
def process_and_group(items: List[str]):
task = chord((process_item.s(i) for i in items), group_items.s())
res = task.delay()
return res.id
random.seed(42)
if __name__ == "__main__":
all_items = ["item1", "item2", "item3", "item4"]
res1 = process_and_group.delay(all_items)
while True:
subtask_id = res1.result
if subtask_id is None:
print("initial", res1.state)
else:
res2 = app.AsyncResult(subtask_id)
print("subtask state", res2.state)
info = res2.info
print("subtask info", res2.info)
if isinstance(res2.info, dict):
group_id = info.get("group_id")
if group_id is not None:
group_res = app.GroupResult(group_id)
# Ideally here, I would like to be able to track the progress all the chords tasks,
# something like an average of all progresses would be OK.
print("group results", group_res.results) # group_res.results is None, unfortunately
if res2.state == "SUCCESS":
break
time.sleep(0.5)
print("subtask result", res2.result)
How do I get something out of this self.request.group UUID? Is there a way to track the progress of chord subtasks? I'm open to any modification of this MWE to get it working.

Related

How to send signals/variables between a QDialog and Main Window

I am currently working on a project that involves graphing text file data into a pyqt graph and I have been running into problems with a subclass QDialog box. My goal is to have the QDialog box use a combox to choose between different data sets to graph (The code below shows the "steering angle" setting being chosen). The problem lies with how to make it so that when the Create Graph button is pressed (Found in the QDialog Class), it runs the createGraph(self): function in the main class. I dont know how to work classes that well so I dont know how to make this work.
If anyone has any pointers on either how to get this working, how to properly structure a PYQT Program or how to make it more efficient, I'm all ears.
Thank you for your time!
Main Window Code:
class MainWidget(QMainWindow):
def __init__(self, parent=None):
super(MainWidget, self).__init__(parent)
self.activateWindow()
self.raise_()
self.setupGraph()
self.dockcheck = 0
self.graphcheck = 0
self.setWindowTitle("Drag and Drop Test")
self.resize(1200, 800)
self.setAcceptDrops(True)
self.LBLDragAndDrop = QLabel("Drag And Drop Files Here")
self.LBLDragAndDrop.setAlignment(Qt.AlignHCenter | Qt.AlignVCenter)
if self.graphcheck == 0:
self.setCentralWidget(self.LBLDragAndDrop)
self.path3 = "C:\\Users\\steph\\OneDrive\\Documents\\SAA Wing\\Coding\\Can Bus Data Reading\\Temporary Saves"
self.treeview = QTreeView()
self.treeview.setAnimated(True)
self.fileModel = QFileSystemModel()
self.fileModel.setRootPath(self.path3)
self.indexRoot = self.fileModel.index(self.fileModel.rootPath())
self.treeview.setModel(self.fileModel)
self.treeview.setRootIndex(self.fileModel.index(self.path3))
self.treeview.setColumnWidth(0, 250)
self.treeview.doubleClicked.connect(self.onSelectionChanged)
#self.treeview.doubleClicked.connect(self.openDialog)
####################################################################################################################
# Dialog Box
####################################################################################################################
def onSelectionChanged(self, index):
self.selectionPath = self.sender().model().filePath(index)
self.selectionFilename = (self.selectionPath.split("/")[-1])
IDList = ("ID 00d0","ID 00d1","ID 00d3","ID 00d4","ID 0140","ID 0141","ID 0360","ID 0361")
if self.selectionFilename in IDList:
if self.selectionFilename == "ID 00d0":
editDialog = Dialog00d0()
editDialog.exec_()
####################################################################################################################
# Graphing data
####################################################################################################################
def createGraph(self):
self.graphcheck = 1
if self.graphcheck == 1:
self.setCentralWidget(self.scroll_area)
################################################################################################################
# ID 00D0 Creating Graph
################################################################################################################
if self.selectionFilename == "ID 00d0":
self.df00d0 = pd.read_table(self.selectionPath, header=None , delim_whitespace=True, dtype=object)
self.df00d0.columns = ['Timestamp','ID',"B0","B1","B2","B3","B4","B5","B6","B7"]
self.df00d0.dropna(inplace=True)
self.SA = np.array([], dtype=float)
self.LatAcc = np.array([], dtype=float)
self.LonAcc = np.array([], dtype=float)
self.ComAcc = np.array([], dtype=float)
self.Time00d0 = np.array([], dtype=float)
self.Timestamp00d0 = np.array([], dtype=float)
############################################################################################################
# Getting Time Stamps
############################################################################################################
for item in self.df00d0['Timestamp']:
self.Time00d0 = np.append(self.Time00d0, datetime.fromtimestamp(float(item)).strftime("%H:%M:%S.%f")[:-4])
self.Timestamp00d0 = np.append(self.Timestamp00d0, float(item))
############################################################################################################
# Steering Angle Graph
############################################################################################################
if self.combobox00d0.currentText() == "Steering Angle":
SA_ = (((self.df00d0['B1']) + (self.df00d0['B0'])).apply(int, base=16) * 0.1)
for item in SA_:
if item > 6000:
self.SA = np.append(self.SA, round((item - 6553.6), 1))
else:
self.SA = np.append(self.SA, round(item))
y_value = self.SA
Here is the QDialog Box class code:
class Dialog00d0(QDialog):
def __init__(self):
super().__init__()
self.layout = QVBoxLayout()
hlay = QHBoxLayout()
self.setLayout(self.layout)
self.setWindowTitle("Create Graph")
label = QLabel("Data Type")
self.combobox00d0 = QComboBox()
self.combobox00d0.addItem("Steering Angle")
self.combobox00d0.addItem("Latitudinal Acceleration")
self.combobox00d0.addItem("Longitudinal Acceleration")
self.combobox00d0.addItem("Combined Acceleration")
self.BTNCreateGraph = QPushButton("Create Graph")
self.BTNCancel = QPushButton("Cancel")
hlay.addWidget(self.BTNCreateGraph)
hlay.addWidget(self.BTNCancel)
self.layout.addWidget(label)
self.layout.addWidget(self.combobox00d0)
self.layout.addLayout(hlay)
self.BTNCreateGraph.clicked.connect("I need the self.creatGraph here")
self.BTNCancel.clicked.connect("self.close")
I imagine this will help you.
The pyqtSignal() argument tells you what information you want to carry.
In this case, I'm passing a text.
Good luck, I hope I helped.
import sys
from PyQt5.QtWidgets import QMainWindow, QDialog, QApplication
from PyQt5.QtWidgets import QPushButton, QVBoxLayout
from PyQt5 import QtCore, QtGui
class MainWidget(QMainWindow):
def __init__(self, parent=None):
super(MainWidget, self).__init__(parent)
button = QPushButton("Button to open dialog")
button.clicked.connect(self.button_clicked)
self.setCentralWidget(button)
self.show()
def button_clicked(self):
dlg = Dialog00d0()
dlg.signEmit.connect(self.createGraph)
dlg.exec()
def createGraph(self, _str):
print('Now Im here')
print(_str)
class Dialog00d0(QDialog):
signEmit = QtCore.pyqtSignal(str)
def __init__(self):
super().__init__()
self.layout = QVBoxLayout()
self.BTNCreateGraph = QPushButton("link to createGraph()")
self.layout.addWidget(self.BTNCreateGraph)
self.setLayout(self.layout)
self.BTNCreateGraph.clicked.connect(self.BTNCreateGraph_clicked)
def BTNCreateGraph_clicked(self):
self.signEmit.emit('But I passed by')
app = QApplication(sys.argv)
win = MainWidget()
app.exec()

Error when reloading new Plugin code in QGIS

I am nooby to Plugin Development but I'm trying to create a plugin in QGIS, for a Uni-subject, with its own graphical interface, which will receive a zipcode from the user and return the name of the corresponding location.
I already created the plugin skeleton, through the Plugin Builder and have designed the graphical interface with QtDesigner https://i.stack.imgur.com/h6k6Q.png . I also added the .txt file that contains the zipcodes database to the plugin folder as a resource.
From what I understand, the file to edit is the one that ends in dialog.py, through the init() method, in order to establish the connections between the signals emitted by the elements of the graphical interface and the corresponding callbacks.
However, when I change the code in the dialog.py and reload the plugin, it gives me an error, and when starting the QGIS it pop-ups an error message, and the plugin no longer appears. Error message after plugin reload
Could you give me some guidance here and maybe point me to where the problem could be? Thanks
The code is this one:
import os
import sys
import qgis.core
from qgis.PyQt import uic
from qgis.PyQt import (
QtCore,
QtWidgets
)
import geocoder
sys.path.append(os.path.dirname(__file__))
FORM_CLASS, _ = uic.loadUiType(
os.path.join(
os.path.dirname(__file__),
"example_dialog_base.ui"
),
resource_suffix=""
)
class ExampleDialog(QtWidgets.QDialog, FORM_CLASS):
POSTAL_CODES_PATH = ":/plugins/example/todos_cp.txt"
def __init__(self, parent=None):
"""Constructor."""
super(ExampleDialog, self).__init__(parent)
self.setupUi(self)
# connect signals
self.postal_code_le.textChanged.connect(self.toggle_find_button)
self.find_code_btn.clicked.connect(self.execute)
# set initial state
self.find_code_btn.setEnabled(False)
def toggle_find_button(self):
if self.postal_code_le.text() == "":
self.find_code_btn.setEnabled(False)
else:
self.find_code_btn.setEnabled(True)
def execute(self):
self.address_te.clear()
try:
raw_postal_code = self.postal_code_le.text()
main_code, extension = validate_postal_code(raw_postal_code)
record = self.find_record(main_code, extension)
place_name = record[3]
self.address_te.setPlainText(place_name)
if self.create_layer_chb.isChecked():
self.handle_layer_creation(record)
except (ValueError, RuntimeError) as err:
self.show_error(str(err))
def find_record(self, main_code, extension):
file_handler = QtCore.QFile(self.POSTAL_CODES_PATH)
file_handler.open(QtCore.QIODevice.ReadOnly)
stream = QtCore.QTextStream(file_handler)
while not stream.atEnd():
line = stream.readLine()
info = line.split(";")
code1 = info[-3]
code2 = info[-2]
if code1 == main_code and code2 == extension:
result = info
break
else:
raise RuntimeError("Sem resultados")
return result
def handle_layer_creation(self, record):
place_name = record[3]
point = geocode_place_name(place_name)
print("lon: {} - lat: {}".format(point.x(), point.y()))
layer = create_point_layer(
point,
f"found_location_for_{record[-3]}_{record[-2]}",
place_name
)
current_project = qgis.core.QgsProject.instance()
current_project.addMapLayer(layer)
def show_error(self, message):
message_bar = self.iface.messageBar()
message_bar.pushMessage("Error", message, level=message_bar.Critical)
def validate_postal_code(raw_postal_code):
code1, code2 = raw_postal_code.partition("-")[::2]
if code1 == "" or code2 == "":
raise ValueError(
"Incorrect postal code: {!r}".format(raw_postal_code))
return code1, code2
def geocode_place_name(place_name):
geocoder_object = geocoder.osm(place_name)
lon = geocoder_object.json.get("lng")
lat = geocoder_object.json.get("lat")
if lat is None or lon is None:
raise RuntimeError(
"Could not retrieve lon/lat for "
"place: {!r}".format(place_name)
)
point = qgis.core.QgsPointXY(lon, lat)
return point
def create_point_layer(point, layer_name, place_name):
layer = qgis.core.QgsVectorLayer(
"Point?crs=epsg:4326&field=address:string(100)",
layer_name,
"memory"
)
provider = layer.dataProvider()
geometry = qgis.core.QgsGeometry.fromPointXY(point)
feature = qgis.core.QgsFeature()
feature.setGeometry(geometry)
feature.setAttributes([place_name])
provider.addFeatures([feature])
layer.updateExtents()
return layer

LCD to OLED conversion

I am new to coding. I have this project that I want to get done, so this is daunting. I had someone else write this, but my LCD was DOA, so I want to switch to an OLED. Can anyone help me transcribe this code from LCD display to OLED? I made some progress, I think, but I'm not entirely sure what to do next. I've setup the header to import the SSD1306 python file for the OLED and changed the options for the I2C to the settings for the OLED display. I've set the pins and I've changed what happens when I2C runs to what I think is appropriate for this OLED display, but I'm not sure. And later on there are some codes that control what is written to the display, but this is where I am out of my depth. I don't know what to change at the lcd_print stuff to make it do the same thing, but on an OLED...
from machine import I2C, Pin
import ssd1306
import utime
import machine
import sys
relay_pin = 6
relay = Pin(relay_pin, Pin.OUT)
i2c = I2C(0, sda=Pin(0), scl=Pin(1))
def lcd_print(keyword):
if keyword is not str:
keyword=str(keyword)
I2C_ADDR = 0x3c
i2c = I2C(0, sda=Pin(0), scl=Pin(1))
lcd = ssd1306.SSD1306_I2C(128, 32, i2c)
lcd.text(keyword)
lcd.show()
#keypad
key_map=[["D","#","0","*"],\
["C","9","8","7"],\
["B","6","5","4"],\
["A","3","2","1"]]
col_list=[2,3,4,5]
row_list=[8,9,10,11]
for x in range(0,4):
row_list[x]=Pin(row_list[x], Pin.OUT)
row_list[x].value(1)
for x in range(0,4):
col_list[x] = Pin(col_list[x], Pin.IN, Pin.PULL_UP)
def Keypad4x4Read(cols,rows):
for r in rows:
r.value(0)
result=[cols[0].value(),cols[1].value(),cols[2].value(),cols[3].value()]
if min(result)==0:
key=key_map[int(rows.index(r))][int(result.index(0))]
r.value(1) # manages key keept pressed
return(key)
r.value(1)
def send_pulse(n:int):
for i in range(n):
relay(1)
utime.sleep(0.005)
relay(0)
utime.sleep(0.015)
def save2file(data_:str):
f = open("data.txt", "w")
f.write(data_)
f.close()
def read_from_file():
f = open("data.txt","r")
data_ = f.read()
f.close()
return data_
def check_int(string_:str): #Checks if String content is a numeric value
if string_ is not None:
numerics="0123456789"
for i in string_:
if not i in numerics:
return False
return True
try:
val = read_from_file()
except:
val = "00000"
def get_order():
temp=""
def fill_zeros(string_input:str, max_len:int):
str_len = len(string_input)
zeros = "0" * (max_len-str_len)
return zeros
global val
lcd_print(fill_zeros(val,5)+val[0:-1]+"."+val[-1]+"\n")
while True:
key = Keypad4x4Read(col_list,row_list)
if key != None and key != '#':
temp=temp+key
utime.sleep(0.3)
lcd_print("\n"+temp)
if key == "#" and temp != "":
if temp[0]=='A' and temp[1:]==val and check_int(temp[1:]):
pulse_times=100000-int(temp[1:])
lcd_print("R"+fill_zeros(temp,5)+val[0:-1]+"."+val[-1]+"\n")
send_pulse(pulse_times)
val ="0000.0"
lcd_print("0000.0")
save2file(val)
temp=""
elif check_int(temp):
if int(temp)>99999:
lcd_print("\nOUT OF RANGE") #The entered value is out of range
temp=""
utime.sleep(1)
lcd_print(val[0:-1]+"."+val[-1]+"\n")
continue
val = temp
lcd_print(fill_zeros(temp,5)+val[0:-1]+"."+val[-1]+"\nSENDING PULSE")
send_pulse(int(val))
save2file(val)
lcd_print(fill_zeros(temp,5)+val[0:-1]+"."+val[-1])
temp=""
elif temp == "B":
save2file(val)
lcd_print("DATA SAVED...\nSYSTEM PAUSED...")
temp=""
elif temp == "C":
val = "00000"
save2file(val)
temp=""
lcd_print("...RESET...")
sys.exit()
else:
lcd_print("UNDEFINED \nCOMMAND...")
temp=""
utime.sleep(1)
lcd_print(fill_zeros(temp,5)+val[0:-1]+"."+val[-1])
get_order()

How to set goals for Locust?

We have a Locust load/performance test running (in a docker-compose setup).
It now runs on our build-server.
Ideally we would like the build-job to fail if certain requirements are not met.
For example require a certain average response time, or a minimum number of requests within a given timeout.
The expections/requirements must be compared with the aggregated data. So not in the individual (python) test methods.
One option can be to parse the generated reports, but I imagine locust has built-in support for the feature I'm thinking of.
Have a look at locust-plugins, specifically the custom command line options like --check-rps, --check-fail-ratio and --check-avg-response-time
https://github.com/SvenskaSpel/locust-plugins/blob/master/examples/cmd_line_examples.sh
For the purpose of automated KPI validation, you could create a custom plugin using locust’s event hooks and its inner statistics. The overall plugin design is pretty simple:
Register quitting event
Get all statistics and serialize them
Calculate missing metrics (RPS, percentiles, …)
Check provided KPI definition
Validate provided KPI agains actual measurements
The whole KPI plugin code looks like this:
import logging
from enum import Enum
from typing import List
import locust.env
from locust.stats import calculate_response_time_percentile
class Metric(Enum):
EROR_RATE = 'error_rate'
PERCENTILE_90 = 'percentile_90'
RPS = 'rps'
#staticmethod
def has_value(item):
return item in [v.value for v in Metric.__members__.values()]
class KpiPluigin:
def __init__(
self,
env: locust.env.Environment,
kpis: List,
):
self.env = env
self.kpis = kpis
self.errors = []
self._validate_kpis()
events = self.env.events
events.quitting.add_listener(self.quitting) # pyre-ignore
def quitting(self, environment):
serialized_stats = self.serialize_stats(self.env.stats)
updated_stats = self._update_data(serialized_stats)
self._kpi_check(updated_stats)
self._interpret_errors()
def serialize_stats(self, stats):
return [stats.entries[key].serialize() for key in stats.entries.keys() if
not (stats.entries[key].num_requests == 0 and stats.entries[key].num_failures == 0)]
def _update_data(self, stats):
for stat in stats:
stat['error_rate'] = self._calculate_fail_rate(stat)
stat['percentile_90'] = self._calculate_percentile(stat, 0.90)
stat['rps'] = self._calculate_rps(stat)
return stats
def _calculate_rps(self, stat):
rps = stat['num_reqs_per_sec']
num_of_measurements = len(rps)
return sum(rps.values()) / num_of_measurements
def _calculate_fail_rate(self, stat):
num_failures = stat['num_failures']
num_requests = stat["num_requests"]
return (num_failures / num_requests) * 100
def _calculate_percentile(self, stat, percentile):
response_times = stat['response_times']
num_requests = stat['num_requests']
return calculate_response_time_percentile(response_times, num_requests, percentile)
def _kpi_check(self, stats):
if len(stats) == 0:
return
for kpi in self.kpis:
name = list(kpi.keys())[0]
stat = next(stat for stat in stats if stat["name"] == name)
if stat:
kpi_settings = kpi[list(kpi.keys())[0]]
for kpi_setting in kpi_settings:
self._metrics_check(kpi_setting, stat)
def _metrics_check(self, kpi_setting, stat):
(metric, value) = kpi_setting
name = stat["name"]
if metric == Metric.EROR_RATE.value:
error_rate = stat['error_rate']
error_rate <= value or self._log_error(error_rate, kpi_setting, name)
if metric == Metric.PERCENTILE_90.value:
percentile = stat['percentile_90']
percentile <= value or self._log_error(percentile, kpi_setting, name)
if metric == Metric.RPS.value:
rps = stat['rps']
rps >= value or self._log_error(rps, kpi_setting, name)
def _log_error(self, stat_value, kpi_settings, name):
(metric, value) = kpi_settings
self.errors.append(
f"{metric} for '{name}' is {stat_value}, but expected it to be better than {value}") # noqa: E501
def _interpret_errors(self):
if len(self.errors) == 0:
logging.info('All KPIs are good!')
else:
for error in self.errors:
logging.error(f"SLA failed: \n {error}")
self.env.process_exit_code = 1
def _validate_kpis(self):
for kpi in self.kpis:
kpi_keys = list(kpi.keys())
if len(kpi_keys) > 1:
raise Exception("Every dict must contain definition for only one endpoint")
kpi_settings = kpi[kpi_keys[0]]
if len(kpi_settings) == 0:
raise Exception(f"No KPI defined for endpoint {kpi_keys[0]}")
for kpi_setting in kpi_settings:
(metric, value) = kpi_setting
if not isinstance(value, (int, float)):
raise Exception(f"Provide valid value for '{metric}' metric for endpoint {kpi_keys[0]}")
if not Metric.has_value(metric):
raise Exception(f"Metric {metric} not implemented")
Now you have to register KpiPlugin class within your Locust script and define KPI(s) like this:
events.init.add_listener
def on_locust_init(environment, **_kwargs):
KPI_SETTINGS = [{'/store/inventory': [('percentile_90', 50), ('rps', 500), ('error_rate', 0)]}]
KpiPlugin(env=environment, kpis=KPI_SETTINGS)
The above script will make your build fail in case /store/inventory endpoint won't meet one of the defined criteria — 90 percentile is worse than 50ms, RPS is under 500, the error rate is higher than 0%.

Efficient way to optimise a Scala code to read large file that doesn't fit in memory

Problem Statement Below,
We have a large log file which stores user interactions with an application. The entries in the log file follow the following schema: {userId, timestamp, actionType} where actionType is one of two possible values: [open, close]
Constraints:
The log file is too big to fit in memory on one machine. Also assume that the aggregated data doesn’t fit into memory.
Code has to be able to run on a single machine.
Should not use an out-of-the box implementation of mapreduce or 3rd party database; don’t assume we have a Hadoop or Spark or other distributed computing framework.
There can be multiple entries of each actionType for each user, and there might be missing entries in the log file. So a user might be missing a close record between two open records or vice versa.
Timestamps will come in strictly ascending order.
For this problem, we need to implement a class/classes that computes the average time spent by each user between open and close. Keep in mind that there are missing entries for some users, so we will have to make a choice about how to handle these entries when making our calculations. Code should follow a consistent policy with regards to how we make that choice.
The desired output for the solution should be [{userId, timeSpent},….] for all the users in the log file.
Sample log file (comma-separated, text file)
1,1435456566,open
2,1435457643,open
3,1435458912,open
1,1435459567,close
4,1435460345,open
1,1435461234,open
2,1435462567,close
1,1435463456,open
3,1435464398,close
4,1435465122,close
1,1435466775,close
Approach
Below is the code I've written in Python & Scala, which seems to be not efficient and upto the expectations of the scenario given, I'd like to feedback from community of developers in this forum how better we could optimise this code as per given scenario.
Scala implementation
import java.io.FileInputStream
import java.util.{Scanner, Map, LinkedList}
import java.lang.Long
import scala.collection.mutable
object UserMetrics extends App {
if (args.length == 0) {
println("Please provide input data file name for processing")
}
val userMetrics = new UserMetrics()
userMetrics.readInputFile(args(0),if (args.length == 1) 600000 else args(1).toInt)
}
case class UserInfo(userId: Integer, prevTimeStamp: Long, prevStatus: String, timeSpent: Long, occurence: Integer)
class UserMetrics {
val usermap = mutable.Map[Integer, LinkedList[UserInfo]]()
def readInputFile(stArr:String, timeOut: Int) {
var inputStream: FileInputStream = null
var sc: Scanner = null
try {
inputStream = new FileInputStream(stArr);
sc = new Scanner(inputStream, "UTF-8");
while (sc.hasNextLine()) {
val line: String = sc.nextLine();
processInput(line, timeOut)
}
for ((key: Integer, userLs: LinkedList[UserInfo]) <- usermap) {
val userInfo:UserInfo = userLs.get(0)
val timespent = if (userInfo.occurence>0) userInfo.timeSpent/userInfo.occurence else 0
println("{" + key +","+timespent + "}")
}
if (sc.ioException() != null) {
throw sc.ioException();
}
} finally {
if (inputStream != null) {
inputStream.close();
}
if (sc != null) {
sc.close();
}
}
}
def processInput(line: String, timeOut: Int) {
val strSp = line.split(",")
val userId: Integer = Integer.parseInt(strSp(0))
val curTimeStamp = Long.parseLong(strSp(1))
val status = strSp(2)
val uInfo: UserInfo = UserInfo(userId, curTimeStamp, status, 0, 0)
val emptyUserInfo: LinkedList[UserInfo] = new LinkedList[UserInfo]()
val lsUserInfo: LinkedList[UserInfo] = usermap.getOrElse(userId, emptyUserInfo)
if (lsUserInfo != null && lsUserInfo.size() > 0) {
val lastUserInfo: UserInfo = lsUserInfo.get(lsUserInfo.size() - 1)
val prevTimeStamp: Long = lastUserInfo.prevTimeStamp
val prevStatus: String = lastUserInfo.prevStatus
if (prevStatus.equals("open")) {
if (status.equals(lastUserInfo.prevStatus)) {
val timeSelector = if ((curTimeStamp - prevTimeStamp) > timeOut) timeOut else curTimeStamp - prevTimeStamp
val timeDiff = lastUserInfo.timeSpent + timeSelector
lsUserInfo.remove()
lsUserInfo.add(UserInfo(userId, curTimeStamp, status, timeDiff, lastUserInfo.occurence + 1))
} else if(!status.equals(lastUserInfo.prevStatus)){
val timeDiff = lastUserInfo.timeSpent + curTimeStamp - prevTimeStamp
lsUserInfo.remove()
lsUserInfo.add(UserInfo(userId, curTimeStamp, status, timeDiff, lastUserInfo.occurence + 1))
}
} else if(prevStatus.equals("close")) {
if (status.equals(lastUserInfo.prevStatus)) {
lsUserInfo.remove()
val timeSelector = if ((curTimeStamp - prevTimeStamp) > timeOut) timeOut else curTimeStamp - prevTimeStamp
lsUserInfo.add(UserInfo(userId, curTimeStamp, status, lastUserInfo.timeSpent + timeSelector, lastUserInfo.occurence+1))
}else if(!status.equals(lastUserInfo.prevStatus))
{
lsUserInfo.remove()
lsUserInfo.add(UserInfo(userId, curTimeStamp, status, lastUserInfo.timeSpent, lastUserInfo.occurence))
}
}
}else if(lsUserInfo.size()==0){
lsUserInfo.add(uInfo)
}
usermap.put(userId, lsUserInfo)
}
}
Python Implementation
import sys
def fileBlockStream(fp, number_of_blocks, block):
#A generator that splits a file into blocks and iterates over the lines of one of the blocks.
assert 0 <= block and block < number_of_blocks #Assertions to validate number of blocks given
assert 0 < number_of_blocks
fp.seek(0,2) #seek to end of file to compute block size
file_size = fp.tell()
ini = file_size * block / number_of_blocks #compute start & end point of file block
end = file_size * (1 + block) / number_of_blocks
if ini <= 0:
fp.seek(0)
else:
fp.seek(ini-1)
fp.readline()
while fp.tell() < end:
yield fp.readline() #iterate over lines of the particular chunk or block
def computeResultDS(chunk,avgTimeSpentDict,defaultTimeOut):
countPos,totTmPos,openTmPos,closeTmPos,nextEventPos = 0,1,2,3,4
for rows in chunk.splitlines():
if len(rows.split(",")) != 3:
continue
userKeyID = rows.split(",")[0]
try:
curTimeStamp = int(rows.split(",")[1])
except ValueError:
print("Invalid Timestamp for ID:" + str(userKeyID))
continue
curEvent = rows.split(",")[2]
if userKeyID in avgTimeSpentDict.keys() and avgTimeSpentDict[userKeyID][nextEventPos]==1 and curEvent == "close":
#Check if already existing userID with expected Close event 0 - Open; 1 - Close
#Array value within dictionary stores [No. of pair events, total time spent (Close tm-Open tm), Last Open Tm, Last Close Tm, Next expected Event]
curTotalTime = curTimeStamp - avgTimeSpentDict[userKeyID][openTmPos]
totalTime = curTotalTime + avgTimeSpentDict[userKeyID][totTmPos]
eventCount = avgTimeSpentDict[userKeyID][countPos] + 1
avgTimeSpentDict[userKeyID][countPos] = eventCount
avgTimeSpentDict[userKeyID][totTmPos] = totalTime
avgTimeSpentDict[userKeyID][closeTmPos] = curTimeStamp
avgTimeSpentDict[userKeyID][nextEventPos] = 0 #Change next expected event to Open
elif userKeyID in avgTimeSpentDict.keys() and avgTimeSpentDict[userKeyID][nextEventPos]==0 and curEvent == "open":
avgTimeSpentDict[userKeyID][openTmPos] = curTimeStamp
avgTimeSpentDict[userKeyID][nextEventPos] = 1 #Change next expected event to Close
elif userKeyID in avgTimeSpentDict.keys() and avgTimeSpentDict[userKeyID][nextEventPos]==1 and curEvent == "open":
curTotalTime,closeTime = missingHandler(defaultTimeOut,avgTimeSpentDict[userKeyID][openTmPos],curTimeStamp)
totalTime = curTotalTime + avgTimeSpentDict[userKeyID][totTmPos]
avgTimeSpentDict[userKeyID][totTmPos]=totalTime
avgTimeSpentDict[userKeyID][closeTmPos]=closeTime
avgTimeSpentDict[userKeyID][openTmPos]=curTimeStamp
eventCount = avgTimeSpentDict[userKeyID][countPos] + 1
avgTimeSpentDict[userKeyID][countPos] = eventCount
elif userKeyID in avgTimeSpentDict.keys() and avgTimeSpentDict[userKeyID][nextEventPos]==0 and curEvent == "close":
curTotalTime,openTime = missingHandler(defaultTimeOut,avgTimeSpentDict[userKeyID][closeTmPos],curTimeStamp)
totalTime = curTotalTime + avgTimeSpentDict[userKeyID][totTmPos]
avgTimeSpentDict[userKeyID][totTmPos]=totalTime
avgTimeSpentDict[userKeyID][openTmPos]=openTime
eventCount = avgTimeSpentDict[userKeyID][countPos] + 1
avgTimeSpentDict[userKeyID][countPos] = eventCount
elif curEvent == "open":
#Initialize userid with Open event
avgTimeSpentDict[userKeyID] = [0,0,curTimeStamp,0,1]
elif curEvent == "close":
#Initialize userid with missing handler function since there is no Open event for this User
totaltime,OpenTime = missingHandler(defaultTimeOut,0,curTimeStamp)
avgTimeSpentDict[userKeyID] = [1,totaltime,OpenTime,curTimeStamp,0]
def missingHandler(defaultTimeOut,curTimeVal,lastTimeVal):
if lastTimeVal - curTimeVal > defaultTimeOut:
return defaultTimeOut,curTimeVal
else:
return lastTimeVal - curTimeVal,curTimeVal
def computeAvg(avgTimeSpentDict,defaultTimeOut):
resDict = {}
for k,v in avgTimeSpentDict.iteritems():
if v[0] == 0:
resDict[k] = 0
else:
resDict[k] = v[1]/v[0]
return resDict
if __name__ == "__main__":
avgTimeSpentDict = {}
if len(sys.argv) < 2:
print("Please provide input data file name for processing")
sys.exit(1)
fileObj = open(sys.argv[1])
number_of_chunks = 4 if len(sys.argv) < 3 else int(sys.argv[2])
defaultTimeOut = 60000 if len(sys.argv) < 4 else int(sys.argv[3])
for chunk_number in range(number_of_chunks):
for chunk in fileBlockStream(fileObj, number_of_chunks, chunk_number):
computeResultDS(chunk, avgTimeSpentDict, defaultTimeOut)
print (computeAvg(avgTimeSpentDict,defaultTimeOut))
avgTimeSpentDict.clear() #Nullify dictionary
fileObj.close #Close the file object
Both program above gives desired output, but efficiency is what matters for this particular scenario. Let me know if you've anything better or any suggestions on existing implementation.
Thanks in Advance!!
What you are after is iterator usage. I'm not going to re-write your code, but the trick here is likely to be using an iterator. Fortunately Scala provides decent out of the box tooling for the job.
import scala.io.Source
object ReadBigFiles {
def read(fileName: String): Unit = {
val lines: Iterator[String] = Source.fromFile(fileName).getLines
// now you get iterator semantics for the file line traversal
// that means you can only go through the lines once, but you don't incur a penalty on heap usage
}
}
For your use case, you seem to require a lastUser, so you're dealing with groups of 2 entries. I think you you have two choices, either go for iterator.sliding(2), which will produce iterators for every pair, or simply add recursion to the mix using options.
def navigate(source: Iterator[String], last: Option[User]): ResultType = {
if (source.hasNext) {
val current = source.next()
last match {
case Some(existing) => // compare with previous user etc
case None => navigate(source, Some(current))
}
} else {
// exit recursion, return result
}
}
You can avoid all the code you've written to read the file and so on. If you need to count occurrences, simply build a Map inside your recursion, and increment the occurrences at every step based on your business logic.
from queue import LifoQueue, Queue
def averageTime() -> float:
logs = {}
records = Queue()
with open("log.txt") as fp:
lines = fp.readlines()
for line in lines:
if line[0] not in logs:
logs[line[0]] = LifoQueue()
logs[line[0]].put((line[1], line[2]))
else:
logs[line[0]].put((line[1], line[2]))
for k in logs:
somme = 0
count = 0
while not logs[k].empty():
l = logs[k].get()
somme = (somme + l[0]) if l[1] == "open" else (somme - l[0])
count = count + 1
records.put([k, somme, count//2])
while not records.empty():
record = records.get()
print(f"UserId={record[0]} Avg={record[1]/record[2]}")