using dataloader to interface kafka data - apache-kafka

i use dataloader to inferface the data in kafka and it doesnt work
here is my code
class kfkdataset(Dataset):
def __init__(self,consumer,image_size):
super(kfkdataset).__init__()
self.image_size=image_size
self.consumer = consumer
def __getitem__(self, index):
info = json.loads(next(self.consumer).value)
image_osspath = info['path']
image = prep_image_batch(image_osspath,self.image_size)
return image,image_osspath
def __len__(self):
# You should change 0 to the total size of your dataset.
return 9000000
consumer = KafkaConsumer('my-topic',bootstrap_servers=[])
prodataset = kfkdataset(consumer,image_size=608)#)
k = DataLoader(prodataset,
batch_size=batch_size,
num_workers=16)
for inputimage,osspath in k:
inputimage = inputimage.to(device)
detections,_ = model(inputimage)
detections = non_max_suppression(detections, 0.98, 0.4)
it works when num_workers is 1
when num_workers >1:
errors came out
File "batch_upload.py", line 80, in
for inputimage,osspath in k:
File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py", line 801, in__next__
return self._process_data(data)
File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py", line 846,in_process_data
data.reraise()
File "/usr/local/lib/python3.6/dist-packages/torch/_utils.py", line 369, in reraise
raise self.exc_type(msg)
FileExistsError: Caught FileExistsError in DataLoader worker process 1.
Original Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/worker.py", line 178, in _worker_loop
data = fetcher.fetch(index)
File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/fetch.py", line 44, in
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/appbatch/utils/utils.py", line 49, in getitem
info = json.loads(next(self.consumer).value)
File "/usr/local/lib/python3.6/dist-packages/kafka/consumer/group.py", line 1192, in next
return self.next_v2()
File "/usr/local/lib/python3.6/dist-packages/kafka/consumer/group.py", line 1200, in next_v2
return next(self._iterator)
File "/usr/local/lib/python3.6/dist-packages/kafka/consumer/group.py", line 1115, in _message_generator_v2
record_map = self.poll(timeout_ms=timeout_ms, update_offsets=False)
File "/usr/local/lib/python3.6/dist-packages/kafka/consumer/group.py", line 654, in poll
records = self._poll_once(remaining, max_records, update_offsets=update_offsets)
File "/usr/local/lib/python3.6/dist-packages/kafka/consumer/group.py", line 701, in _poll_once
self._client.poll(timeout_ms=timeout_ms)
File "/usr/local/lib/python3.6/dist-packages/kafka/client_async.py", line 600, in poll
self._poll(timeout / 1000)
File "/usr/local/lib/python3.6/dist-packages/kafka/client_async.py", line 629, in _poll
self._register_send_sockets()
File "/usr/local/lib/python3.6/dist-packages/kafka/client_async.py", line 619, in _register_send_sockets
self._selector.modify(key.fileobj, events, key.data)
File "/usr/lib/python3.6/selectors.py", line 261, in modify
key = self.register(fileobj, events, data)
File "/usr/lib/python3.6/selectors.py", line 412, in register
self._epoll.register(key.fd, epoll_events)
FileExistsError: [Errno 17] File exists
i want know how to make it works

Basically, setting num_workers > 1 in PyTorch's DataLoader is creating several worker processes which are in turn biding to the same socket port as there is only one consumer.
One approach to parallelize and improve importing data from Kafka is to create several consumers in the same consumer group for that topic.

Related

psycopg2.errors.UndefinedColumn: column website.sequence does not exist

I got the above error when I restarted the odoo server in docker when a user from our team was making changes to an odoo module. afterwards I was not able to restart odoo.
when i Use \d website the column sequence don't exist in the table.
and when il add the column manualy i have another error for another table.
File "/usr/lib/python3/dist-packages/odoo/tools/cache.py", line 90, in lookup
value = d[key] = self.method(*args, **kwargs)
File "/usr/lib/python3/dist-packages/odoo/addons/website/models/website.py", line 987, in _get_current_website_id
found_websites = self.search([('domain', 'ilike', _remove_port(domain_name))]).sorted('country_group_ids')
File "/usr/lib/python3/dist-packages/odoo/models.py", line 1811, in search
return res if count else self.browse(res)
File "/usr/lib/python3/dist-packages/odoo/models.py", line 5144, in browse
if not ids:
File "/usr/lib/python3/dist-packages/odoo/osv/query.py", line 215, in __bool__
return bool(self._result)
File "/usr/lib/python3/dist-packages/odoo/tools/func.py", line 26, in __get__
value = self.fget(obj)
File "/usr/lib/python3/dist-packages/odoo/osv/query.py", line 208, in _result
self._cr.execute(query_str, params)
File "<decorator-gen-3>", line 2, in execute
File "/usr/lib/python3/dist-packages/odoo/sql_db.py", line 89, in check
return f(self, *args, **kwargs)
File "/usr/lib/python3/dist-packages/odoo/sql_db.py", line 310, in execute
res = self._obj.execute(query, params)
psycopg2.errors.UndefinedColumn: column website.sequence does not exist
LINE 1: ...."domain"::text ilike '%#%') ORDER BY "website"....

How to fix incorrect channel size in pytorch neural network?

I'm working with the Google utterance dataset in spectrogram form. Each data point has dimension (160, 101). In my data loader, I used batch_size=128. Therefore, each batch has dimension (128, 160, 101).
I use a LeNet model with the following code as the model:
class LeNet(nn.Module):
def __init__(self):
super(LeNet, self).__init__()
self.conv1 = nn.Conv2d(1, 6, 5)
self.conv2 = nn.Conv2d(6, 16, 5)
self.fc1 = nn.Linear(16*5*5, 120)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 30)
def forward(self, x):
out = F.relu(self.conv1(x))
out = F.max_pool2d(out, 2)
out = F.relu(self.conv2(out))
out = F.max_pool2d(out, 2)
out = out.view(out.size(0), -1)
out = F.relu(self.fc1(out))
out = F.relu(self.fc2(out))
out = self.fc3(out)
return out
I tried unsqueezing the data with dim=3, but got this error:
Traceback (most recent call last):
File "train_speech.py", line 359, in <module>
train_loss, reg_loss, train_acc, cost = train(epoch)
File "train_speech.py", line 258, in train
outputs = (net(inputs))['out']
File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/torch/nn/parallel/data_parallel.py", line 166, in forward
return self.module(*inputs[0], **kwargs[0])
File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/content/gdrive/My Drive/Colab Notebooks/mixup_erm-master/models/lenet.py", line 15, in forward
out = F.relu(self.conv1(x))
File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/conv.py", line 443, in forward
return self._conv_forward(input, self.weight, self.bias)
File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/conv.py", line 440, in _conv_forward
self.padding, self.dilation, self.groups)
RuntimeError: Given groups=1, weight of size [6, 1, 5, 5], expected input[128, 160, 101, 1] to have 1 channels, but got 160 channels instead
How do I fix this issue?
EDIT: New Error Message Below
torch.Size([128, 160, 101])
torch.Size([128, 1, 160, 101])
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py:718: UserWarning: Named tensors and all their associated APIs are an experimental feature and subject to change. Please do not use them for anything important until they are released as stable. (Triggered internally at /pytorch/c10/core/TensorImpl.h:1156.)
return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
Traceback (most recent call last):
File "train_speech.py", line 363, in <module>
train_loss, reg_loss, train_acc, cost = train(epoch)
File "train_speech.py", line 262, in train
outputs = (net(inputs))['out']
IndexError: too many indices for tensor of dimension 2
I'm unsqueezing the data in each batch. The relevant section of my training code is below. inputs is analogous to x.
print(inputs.shape)
inputs = inputs.unsqueeze(1)
print(inputs.shape)
outputs = (net(inputs))['out']
Edit 2: New Error
Traceback (most recent call last):
File "train_speech.py", line 361, in <module>
train_loss, reg_loss, train_acc, cost = train(epoch)
File "train_speech.py", line 270, in train
loss.backward()
File "/usr/local/lib/python3.7/dist-packages/torch/_tensor.py", line 255, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/usr/local/lib/python3.7/dist-packages/torch/autograd/__init__.py", line 149, in backward
allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag
RuntimeError: Function AddmmBackward returned an invalid gradient at index 1 - got [128, 400] but expected shape compatible with [128, 13024]
Edit 3: Train Loop Below
def train(epoch):
print('\nEpoch: %d' % epoch)
net.train()
train_loss = 0
reg_loss = 0
correct = 0
total = 0
for batch_idx, (inputs, targets) in enumerate(trainloader):
if use_cuda:
inputs, targets = inputs.cuda(), targets.cuda()
inputs, targets_a, targets_b, lam,layer, cost = mixup_data(inputs, targets,
args.alpha,args.mixupBatch, use_cuda)
inputs, targets_a, targets_b = map(Variable, (inputs,
targets_a, targets_b))
outputs = net(inputs)
loss = mixup_criterion(criterion, outputs, targets_a, targets_b, lam)
train_loss += loss.item()
_, predicted = torch.max(outputs.data, 1)
total += targets.size(0)
correct += (lam * predicted.eq(targets_a.data).cpu().sum().float()
+ (1 - lam) * predicted.eq(targets_b.data).cpu().sum().float())
optimizer.zero_grad()
loss.backward()
optimizer.step()
return (train_loss/batch_idx, reg_loss/batch_idx, 100.*correct/total, cost/batch_idx)
You should expand on axis=1 a.k.a. the channel axis:
>>> x = x.unsqueeze(1)
If you're inside the dataset __getitem__, then it corresponds to axis=0.

Read GCS file into dask dataframe

I want to read a csv file stored in Google Cloud Storage using dask dataframe.
I have insalled gcsfs & dask in the conda env. on Windows
import dask.dataframe as dd
import gcsfs
project_id = 'my-project'
token_file = 'C:\\path\to\credentials.json'
fs = gcsfs.GCSFileSystem(project=project_id)
gcs_bucket_name = 'my_bucket'
df = dd.read_csv('gs://'+gcs_bucket_name+'/my_file.csv',storage_options={'token': token_file, 'project': project_id})
I know I'm not providing the key file correctly as per
https://gcsfs.readthedocs.io/en/latest/ but not sure how to do it. Can anyone please help?
Error I'm getting -
File "<ipython-input-XXXXXXXX>", line 1, in <module>
runfile('C:/path/to/scripts/my_python_script.py', wdir='C:/path/to/scripts')
File "C:\Users\AppData\Local\Continuum\anaconda3\envs\my-rdkit-env\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 827, in runfile
execfile(filename, namespace)
File "C:\Users\AppData\Local\Continuum\anaconda3\envs\my-rdkit-env\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 110, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/path/to/scripts/my_python_script.py", line 28, in <module>
df = dd.read_csv('gcs://'+gcs_bucket_name+'/my_file.csv',storage_options={'token': token_file, 'project': project_id})
File "C:\Users\AppData\Local\Continuum\anaconda3\envs\my-rdkit-env\lib\site-packages\dask\dataframe\io\csv.py", line 578, in read
**kwargs
File "C:\Users\AppData\Local\Continuum\anaconda3\envs\my-rdkit-env\lib\site-packages\dask\dataframe\io\csv.py", line 444, in read_pandas
head = reader(BytesIO(b_sample), **kwargs)
File "C:\Users\AppData\Local\Continuum\anaconda3\envs\my-rdkit-env\lib\site-packages\pandas\io\parsers.py", line 685, in parser_f
return _read(filepath_or_buffer, kwds)
File "C:\Users\AppData\Local\Continuum\anaconda3\envs\my-rdkit-env\lib\site-packages\pandas\io\parsers.py", line 463, in _read
data = parser.read(nrows)
File "C:\Users\AppData\Local\Continuum\anaconda3\envs\my-rdkit-env\lib\site-packages\pandas\io\parsers.py", line 1154, in read
ret = self._engine.read(nrows)
File "C:\Users\AppData\Local\Continuum\anaconda3\envs\my-rdkit-env\lib\site-packages\pandas\io\parsers.py", line 2059, in read
data = self._reader.read(nrows)
File "pandas/_libs/parsers.pyx", line 881, in pandas._libs.parsers.TextReader.read
File "pandas/_libs/parsers.pyx", line 896, in pandas._libs.parsers.TextReader._read_low_memory
File "pandas/_libs/parsers.pyx", line 950, in pandas._libs.parsers.TextReader._read_rows
File "pandas/_libs/parsers.pyx", line 937, in pandas._libs.parsers.TextReader._tokenize_rows
File "pandas/_libs/parsers.pyx", line 2132, in pandas._libs.parsers.raise_parser_error
ParserError: Error tokenizing data. C error: Expected 4 fields in line 3, saw 134

"InterfaceError: connection already closed" when using multiprocessing.Pool on black box function that queries PostgreSQL database

I've been given a Python (2.7) function that takes 3 strings as arguments, and returns a list of dictionaries. Due to the nature of the project, I can't alter the function, which is quite complex, calling several other non-standard Python modules and querying a PostgreSQL database using psychopg2. I think that it's the Postgres functionality that's causing me problems.
I want to use the multiprocessing module to speed up calling the function hundreds of times. I've written a "helper" function so that I can use multiprocessing.Pool (which takes only 1 argument) with my function:
from function_script import function
def function_helper(args):
return function(*args)
And my main code looks like this:
from helper_script import function_helper
from multiprocessing import Pool
argument_a = ['a0', 'a1', ..., 'a99']
argument_b = ['b0', 'b1', ..., 'b99']
argument_c = ['c0', 'c1', ..., 'c99']
input = zip(argument_a, argument_b, argument_c)
p = Pool(4)
results = p.map(function_helper, input)
print results
What I'm expecting is a list of lists of dictionaries, however I get the following errors:
Traceback (most recent call last):
File "/local/python/2.7/lib/python2.7/site-packages/variantValidator/variantValidator.py", line 898, in validator
vr.validate(input_parses)
File "/local/python/2.7/lib/python2.7/site-packages/hgvs/validator.py", line 33, in validate
return self._ivr.validate(var, strict) and self._evr.validate(var, strict)
File "/local/python/2.7/lib/python2.7/site-packages/hgvs/validator.py", line 69, in validate
(res, msg) = self._ref_is_valid(var)
File "/local/python/2.7/lib/python2.7/site-packages/hgvs/validator.py", line 89, in _ref_is_valid
var_x = self.vm.c_to_n(var) if var.type == "c" else var
File "/local/python/2.7/lib/python2.7/site-packages/hgvs/variantmapper.py", line 223, in c_to_n
tm = self._fetch_TranscriptMapper(tx_ac=var_c.ac, alt_ac=var_c.ac, alt_aln_method="transcript")
File "/local/python/2.7/lib/python2.7/site-packages/hgvs/decorators/lru_cache.py", line 176, in wrapper
result = user_function(*args, **kwds)
File "/local/python/2.7/lib/python2.7/site-packages/hgvs/variantmapper.py", line 372, in _fetch_TranscriptMapper
self.hdp, tx_ac=tx_ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
File "/local/python/2.7/lib/python2.7/site-packages/hgvs/transcriptmapper.py", line 69, in __init__
self.tx_identity_info = hdp.get_tx_identity_info(self.tx_ac)
File "/local/python/2.7/lib/python2.7/site-packages/hgvs/decorators/lru_cache.py", line 176, in wrapper
result = user_function(*args, **kwds)
File "/local/python/2.7/lib/python2.7/site-packages/hgvs/dataproviders/uta.py", line 353, in get_tx_identity_info
rows = self._fetchall(self._queries['tx_identity_info'], [tx_ac])
File "/local/python/2.7/lib/python2.7/site-packages/hgvs/dataproviders/uta.py", line 216, in _fetchall
with self._get_cursor() as cur:
File "/local/python/2.7/lib/python2.7/contextlib.py", line 17, in __enter__
return self.gen.next()
File "/local/python/2.7/lib/python2.7/site-packages/hgvs/dataproviders/uta.py", line 529, in _get_cursor
cur.execute("set search_path = " + self.url.schema + ";")
File "/local/python/2.7/lib/python2.7/site-packages/psycopg2/extras.py", line 144, in execute
return super(DictCursor, self).execute(query, vars)
DatabaseError: SSL error: decryption failed or bad record mac
And:
Traceback (most recent call last):
File "/local/python/2.7/lib/python2.7/site-packages/variantValidator/variantValidator.py", line 898, in validator
vr.validate(input_parses)
File "/local/python/2.7/lib/python2.7/site-packages/hgvs/validator.py", line 33, in validate
return self._ivr.validate(var, strict) and self._evr.validate(var, strict)
File "/local/python/2.7/lib/python2.7/site-packages/hgvs/validator.py", line 69, in validate
(res, msg) = self._ref_is_valid(var)
File "/local/python/2.7/lib/python2.7/site-packages/hgvs/validator.py", line 89, in _ref_is_valid
var_x = self.vm.c_to_n(var) if var.type == "c" else var
File "/local/python/2.7/lib/python2.7/site-packages/hgvs/variantmapper.py", line 223, in c_to_n
tm = self._fetch_TranscriptMapper(tx_ac=var_c.ac, alt_ac=var_c.ac, alt_aln_method="transcript")
File "/local/python/2.7/lib/python2.7/site-packages/hgvs/decorators/lru_cache.py", line 176, in wrapper
result = user_function(*args, **kwds)
File "/local/python/2.7/lib/python2.7/site-packages/hgvs/variantmapper.py", line 372, in _fetch_TranscriptMapper
self.hdp, tx_ac=tx_ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
File "/local/python/2.7/lib/python2.7/site-packages/hgvs/transcriptmapper.py", line 69, in __init__
self.tx_identity_info = hdp.get_tx_identity_info(self.tx_ac)
File "/local/python/2.7/lib/python2.7/site-packages/hgvs/decorators/lru_cache.py", line 176, in wrapper
result = user_function(*args, **kwds)
File "/local/python/2.7/lib/python2.7/site-packages/hgvs/dataproviders/uta.py", line 353, in get_tx_identity_info
rows = self._fetchall(self._queries['tx_identity_info'], [tx_ac])
File "/local/python/2.7/lib/python2.7/site-packages/hgvs/dataproviders/uta.py", line 216, in _fetchall
with self._get_cursor() as cur:
File "/local/python/2.7/lib/python2.7/contextlib.py", line 17, in __enter__
return self.gen.next()
File "/local/python/2.7/lib/python2.7/site-packages/hgvs/dataproviders/uta.py", line 526, in _get_cursor
conn.autocommit = True
InterfaceError: connection already closed
Does anybody know what might cause the Pool function to behave like this, when it seems so simple to use in other examples that I've tried? If this isn't enough information to go on, can anyone advise me on a way of getting to the bottom of the problem (this is the first time I've worked with someone else's code)? Alternatively, are there any other ways that I could use the multiprocessing module to call the function hundreds of times?
Thanks
I think what may be happening is that your connection object is used across all workers and when 1 worker has completed all its tasks it closes the connection and meanwhile the other workers are still working and the connection is closed so when one of those workers tries to use the db it is already closed.

Loading a pretrained model fails when multiple GPU was used for training

I have trained a network model and saved its weights and architecture via checkpoint = ModelCheckpoint(filepath='weights.hdf5') callback. During training, I am using multiple GPUs by calling the funtion below:
def make_parallel(model, gpu_count):
def get_slice(data, idx, parts):
shape = tf.shape(data)
size = tf.concat([ shape[:1] // parts, shape[1:] ],axis=0)
stride = tf.concat([ shape[:1] // parts, shape[1:]*0 ],axis=0)
start = stride * idx
return tf.slice(data, start, size)
outputs_all = []
for i in range(len(model.outputs)):
outputs_all.append([])
#Place a copy of the model on each GPU, each getting a slice of the batch
for i in range(gpu_count):
with tf.device('/gpu:%d' % i):
with tf.name_scope('tower_%d' % i) as scope:
inputs = []
#Slice each input into a piece for processing on this GPU
for x in model.inputs:
input_shape = tuple(x.get_shape().as_list())[1:]
slice_n = Lambda(get_slice, output_shape=input_shape, arguments={'idx':i,'parts':gpu_count})(x)
inputs.append(slice_n)
outputs = model(inputs)
if not isinstance(outputs, list):
outputs = [outputs]
#Save all the outputs for merging back together later
for l in range(len(outputs)):
outputs_all[l].append(outputs[l])
# merge outputs on CPU
with tf.device('/cpu:0'):
merged = []
for outputs in outputs_all:
merged.append(merge(outputs, mode='concat', concat_axis=0))
return Model(input=model.inputs, output=merged)
With the testing code:
from keras.models import Model, load_model
import numpy as np
import tensorflow as tf
model = load_model('cpm_log/deneme.hdf5')
x_test = np.random.randint(0, 255, (1, 368, 368, 3))
output = model.predict(x = x_test, batch_size=1)
print output[4].shape
I got the error below:
Traceback (most recent call last):
File "cpm_test.py", line 5, in <module>
model = load_model('cpm_log/Jun5_1000/deneme.hdf5')
File "/usr/local/lib/python2.7/dist-packages/keras/models.py", line 240, in load_model
model = model_from_config(model_config, custom_objects=custom_objects)
File "/usr/local/lib/python2.7/dist-packages/keras/models.py", line 301, in model_from_config
return layer_module.deserialize(config, custom_objects=custom_objects)
File "/usr/local/lib/python2.7/dist-packages/keras/layers/__init__.py", line 46, in deserialize
printable_module_name='layer')
File "/usr/local/lib/python2.7/dist-packages/keras/utils/generic_utils.py", line 140, in deserialize_keras_object
list(custom_objects.items())))
File "/usr/local/lib/python2.7/dist-packages/keras/engine/topology.py", line 2378, in from_config
process_layer(layer_data)
File "/usr/local/lib/python2.7/dist-packages/keras/engine/topology.py", line 2373, in process_layer
layer(input_tensors[0], **kwargs)
File "/usr/local/lib/python2.7/dist-packages/keras/engine/topology.py", line 578, in __call__
output = self.call(inputs, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/keras/layers/core.py", line 659, in call
return self.function(inputs, **arguments)
File "/home/muhammed/DEV_LIBS/developments/mocap/pose_estimation/training/cpm/multi_gpu.py", line 12, in get_slice
def get_slice(data, idx, parts):
NameError: global name 'tf' is not defined
By inspecting the error output, I decide that the problem is with the parallelization code. However, I can't resolve the issue.
You may need to use custom_objects to enable loading of the model.
import tensorflow as tf
model = load_model('model.h5', custom_objects={'tf': tf,})