Need help: I receive a CSV file that contains a stray newline inside one of the quoted fields.
name,age
"Maria",28
"Kevin",30
"Joseph",31
"Faith",20
"Arnel
",21
"Kate",40
How can I identify that line and remove it from the list?
The output should be:
name,age
"Maria",28
"Kevin",30
"Joseph",31
"Faith",20
"Kate",40
This is one approach: let the csv module parse the file (it understands quoted multi-line fields) and drop every row whose first field ends with a newline.
import csv

data = []
# newline="" is required for the csv module to handle newlines
# embedded in quoted fields correctly.
with open(filename, newline="") as infile:
    reader = csv.reader(infile)
    for line in reader:
        if not line[0].endswith("\n"):
            data.append(line)

with open(filename, "w", newline="") as outfile:
    writer = csv.writer(outfile)
    writer.writerows(data)
You can also correct the entry instead of dropping it, using str.strip().
Ex:
import csv

data = []
with open(filename, newline="") as infile:
    reader = csv.reader(infile)
    for line in reader:
        if line[0].endswith("\n"):
            line[0] = line[0].strip()
        data.append(line)

with open(filename, "w", newline="") as outfile:
    writer = csv.writer(outfile)
    writer.writerows(data)
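A more compact variant of the same idea (a sketch, reusing the filename placeholder) strips stray newlines from every field rather than just the first column; note that str.strip() will also remove legitimate leading and trailing whitespace:

import csv

with open(filename, newline="") as infile:
    rows = [[field.strip() for field in row] for row in csv.reader(infile)]

with open(filename, "w", newline="") as outfile:
    csv.writer(outfile).writerows(rows)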
I have a binary file (mp3) whose audio I send to a service in Azure to be transcribed.
The following code works in Databricks.
import os
import requests

url = "https://endpoint_service"
headers = {
    'Ocp-Apim-Subscription-Key': 'MyKey',
    'Content-Type': 'audio/mpeg'
}

def send_audio_transcript(url, payload, headers):
    """Send audio.mp3 to an Azure service to be transcribed to text."""
    response = requests.request("POST", url, headers=headers, data=payload)
    return response.json()

full_path = "<my_path>file.mp3"
with open(full_path, mode='rb') as file:  # b is important -> binary
    fileContent = file.read()

send_audio_transcript(url, fileContent, headers)  # the POST request works
But my audio files are in sensitive storage in a Data Lake, and the only way to access them is through a Spark read. According to the documentation, the way to read a binary file is:
df = spark.read.format("binaryFile").load(full_path)
display(df)
path || modificationTime || length || content
path || sometime || some_length || 2dnUwAC
First try:
content = df.content
test_service = send_audio_transcript(url, content, headers)
ValueError: Cannot convert column into bool: please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions.
Second try (converting Spark to pandas):
pandas_df = df.toPandas()
content = pandas_df["content"]
test_service = send_audio_transcript(url, content, headers)
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
What is the exact Python/PySpark equivalent of:
with open(full_path, mode='rb') as file:  # b is important -> binary
    fileContent = file.read()
Your content data coming from Spark is not the same as the content data coming from opening the file. From Spark (and later pandas) you have a pandas Series, but from opening the file you get a bytes object.
with open(full_path, mode='rb') as file:  # b is important -> binary
    fileContent = file.read()
print(type(fileContent))  # will return <class 'bytes'>
But from Spark:
input_df = spark.read.format("binaryFile").load(full_path)
pandas_df = input_df.toPandas()
content = pandas_df['content']
print(type(content))  # returns <class 'pandas.core.series.Series'>
In your case, to fix the problem you need to take just the first element of the Series:
content_good = content[0]
print(content_good)  # you have your <class 'bytes'>, which is what you need
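Putting the pieces together, an end-to-end sketch (reusing url, headers, send_audio_transcript, and full_path from the question; no pandas round-trip is needed):

file_bytes = spark.read.format("binaryFile").load(full_path) \
    .select("content").first()["content"]  # plain bytes, like file.read()
print(type(file_bytes))  # <class 'bytes'>
test_service = send_audio_transcript(url, file_bytes, headers)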
First, let me assure you I read all the relevant answers and they don't work for me.
I am using a multiprocessing Pool to parallelize my data creation, with MongoDB 5.0 and the pymongo client.
As you can see, I am initializing the Mongo client in the worker as suggested by the available answers, but I still get:
TypeError: cannot pickle '_thread.lock' object
Exception ignored in: <function CommandCursor.__del__ at 0x7f96f6fff160>
Is there a way I can use multiprocessing with pymongo.Cursor?
Any help will be appreciated.
This is the function that calls the Pool:
def get_all_valid_events(
    event_criteria: str,
    all_listings: List[str],
    earnings: List[Dict[str, Any]],
    days_around_earnings=0,
    debug=False,
    poolsize=10,
    chunk_size=100,
    lookback=30,
    lookahead=0
):
    start = time.perf_counter()
    listings = Manager().list(all_listings.copy())
    valid_events = []
    if debug:
        for i in range(ceil(len(listings)/chunk_size)):
            valid_events += get_valid_event_dates_by_listing(event_criteria, listings[i*chunk_size:(i+1)*chunk_size], earnings, days_around_earnings, debug)
    else:
        payload = list()
        for i in range(ceil(len(listings)/chunk_size)):
            payload.append(
                [
                    event_criteria,
                    listings[i*chunk_size:(i+1)*chunk_size],
                    earnings,
                    days_around_earnings,
                    debug,
                    lookback,
                    lookahead
                ]
            )
        with ThreadPool(poolsize) as pool:
            valid_events = pool.starmap(get_valid_event_dates_by_listing, payload)
    print(f"getting all valid true events took {time.perf_counter() - start} sec")
    return valid_events
And this is the worker function:
def get_valid_event_dates_by_listing(
    event_criteria: str,
    listings: List[str],
    earnings_list,
    days_around_earnings=0,
    debug=False,
    lookback=30,
    lookahead=0
) -> List[Tuple[Tuple[str, datetime], int]]:
    # TODO: generalize event filter
    start = time.perf_counter()
    client = MongoClient()
    db = client['stock_signals']
    cursor_candles_by_listing = db.candles.find(
        {'listing': {'$in': listings}},
        {'_id': 0, 'listing': 1, 'date': 1, 'position': 1, 'PD_BBANDS_6_lower': 1, 'close': 1, 'PD_BBANDS_6_upper': 1}
    )
    candles = list(cursor_candles_by_listing)
    df = pd.DataFrame(candles).dropna()
    minimum_position_dict = dict(df.groupby('listing').min()['position'])  # We need the minimum position by listing to filter only events that have lookback
    # Filter only the dates that satisfy the criteria
    lte_previous_bb_6_lower = df['close'] <= df[f"{event_criteria}_lower"].shift()
    gte_previous_bb_6_upper = df['close'] >= df[f"{event_criteria}_upper"].shift()
    potential_true_events_df = df[lte_previous_bb_6_lower | gte_previous_bb_6_upper]
    potential_false_events_df = df.drop(potential_true_events_df.index)
    potential_true_event_dates = potential_true_events_df[['listing', 'date', 'position']].values
    actual_true_event_dates = earning_helpers.filter_event_dates_by_earnings_and_position(potential_true_event_dates, earnings_list, minimum_position_dict, days_around_earning=days_around_earnings, lookback=lookback)
    true_event_dates = [((event_date[0], event_date[1], event_date[2]), 1) for event_date in actual_true_event_dates]
    potential_false_event_dates = potential_false_events_df[['listing', 'date', 'position']].values
    actual_false_event_dates = _random_false_events_from_listing_df(potential_false_event_dates, len(actual_true_event_dates), earnings_list, minimum_position_dict, days_around_earnings, lookback)
    false_events_dates = [((event_date[0], event_date[1], event_date[2]), 0) for event_date in actual_false_event_dates]
    all_event_dates = true_event_dates + false_events_dates
    shuffle(all_event_dates)
    print(f"getting a true sequence for listing took {time.perf_counter() - start} sec")
    return all_event_dates
And this is my main:
from utils import event_helpers, earning_helpers
from utils.queries import get_candle_listing

if __name__ == "__main__":
    all_listings = get_candle_listing.get_listings()
    earnigns = earning_helpers.get_all_earnings_dates()
    res = event_helpers.get_all_valid_events('PD_BBANDS_6', all_listings, earnigns, 2, chunk_size=100)
Full stack trace:
  File "test_multiprocess.py", line 8, in <module>
    res = event_helpers.get_all_valid_events('PD_BBANDS_6', all_listings, earnigns, 2, chunk_size=100)
  File "/media/data/projects/ml/signal_platform/utils/event_helpers.py", line 53, in get_all_valid_events
    valid_events = pool.starmap(get_valid_event_dates_by_listing, payload)
  File "/home/froy001/.asdf/installs/python/3.8.12/lib/python3.8/multiprocessing/pool.py", line 372, in starmap
    return self._map_async(func, iterable, starmapstar, chunksize).get()
  File "/home/froy001/.asdf/installs/python/3.8.12/lib/python3.8/multiprocessing/pool.py", line 771, in get
    raise self._value
  File "/home/froy001/.asdf/installs/python/3.8.12/lib/python3.8/multiprocessing/pool.py", line 537, in _handle_tasks
    put(task)
  File "/home/froy001/.asdf/installs/python/3.8.12/lib/python3.8/multiprocessing/connection.py", line 206, in send
    self._send_bytes(_ForkingPickler.dumps(obj))
  File "/home/froy001/.asdf/installs/python/3.8.12/lib/python3.8/multiprocessing/reduction.py", line 51, in dumps
    cls(buf, protocol).dump(obj)
TypeError: cannot pickle '_thread.lock' object

Exception ignored in: <function CommandCursor.__del__ at 0x7f46e91e21f0>
Traceback (most recent call last):
  File "/home/froy001/.cache/pypoetry/virtualenvs/signal-platform-31MTNyCe-py3.8/lib/python3.8/site-packages/pymongo/command_cursor.py", line 68, in __del__
  File "/home/froy001/.cache/pypoetry/virtualenvs/signal-platform-31MTNyCe-py3.8/lib/python3.8/site-packages/pymongo/command_cursor.py", line 83, in __die
  File "/home/froy001/.cache/pypoetry/virtualenvs/signal-platform-31MTNyCe-py3.8/lib/python3.8/site-packages/pymongo/mongo_client.py", line 1696, in _cleanup_cursor
  File "/home/froy001/.cache/pypoetry/virtualenvs/signal-platform-31MTNyCe-py3.8/lib/python3.8/site-packages/pymongo/client_session.py", line 466, in _end_session
  File "/home/froy001/.cache/pypoetry/virtualenvs/signal-platform-31MTNyCe-py3.8/lib/python3.8/site-packages/pymongo/client_session.py", line 871, in in_transaction
  File "/home/froy001/.cache/pypoetry/virtualenvs/signal-platform-31MTNyCe-py3.8/lib/python3.8/site-packages/pymongo/client_session.py", line 362, in active
AttributeError: 'NoneType' object has no attribute 'STARTING'
Update (01-23): I tried the multiprocess library (which uses dill), but it didn't help.
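For reference, a minimal sketch of the constraint behind this error: everything in the payload handed to a process Pool is pickled before it reaches a worker, so it can contain only plain data. A MongoClient, a live Cursor, or anything else holding a _thread.lock cannot cross the process boundary; the client has to be created inside the worker and cursors materialized into lists before anything is passed back (the listing values below are hypothetical):

from multiprocessing import Pool
from pymongo import MongoClient

def worker(listings):
    # Created inside the worker process, so it is never pickled.
    client = MongoClient()
    db = client['stock_signals']
    # list(...) materializes the cursor; a Cursor itself is not picklable.
    docs = list(db.candles.find({'listing': {'$in': listings}}))
    client.close()
    return len(docs)

if __name__ == "__main__":
    chunks = [['AAPL'], ['MSFT']]  # plain, picklable arguments only
    with Pool(2) as pool:
        print(pool.map(worker, chunks))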
I'm trying to copy and paste an HTML table received in an Outlook email into a new Excel spreadsheet, but I get a pywintypes.com_error. I'm seeking a more Pythonic way to do the equivalent of a "Ctrl+A" on an email body and paste it into a new spreadsheet.
The relevant pieces of code are:
import win32com.client

outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")

# Select main Inbox
inbox = outlook.GetDefaultFolder(6)
messages = inbox.Items

try:
    for message in messages:
        try:
            if message.subject == 'myemailed Report':
                print('Sender:', message.sender)
                print(message.subject)
                mailItem = message.HTMLBody  # <---- Attempting to copy the body of the selected email.
                # Start an instance of Excel
                Xlsx = win32com.client.Dispatch("Excel.Application")
                # Prevent Excel from asking questions.
                Xlsx.DisplayAlerts = True  # will change to False
                Xlsx.Visible = True  # will change to False
                # Create a new Excel Workbook
                workbook = Xlsx.Workbooks.Add()
                ws = workbook.Sheets("Sheet1")
                ws.Range('a7').select
                ws.Paste(mailItem)  # <--------------- Generates Error
                workbook.SaveAs(mydesktop + 'UpdatedSheet.xlsx')
                # Quit Excel
                Xlsx.Quit()
        except:
            x = 1
except:
    x = 1
I get this message:
Traceback (most recent call last):
  File "", line 1, in
    ws.Paste(mailItem)
  File ">", line 3, in Paste
pywintypes.com_error: (-2147352567, 'Exception occurred.', (0, 'Microsoft Excel', 'Paste method of Worksheet class failed', 'xlmain11.chm', 0, -2146827284), None)
Is there a better way? Help is appreciated!
ws.Paste(mailItem) is the main part of the problem. The code should be:
ws.Paste()
However, copying the email body as if hitting "Ctrl+A" is a little more involved, and there are a lot of almost-answers. I managed to get the following to work, though I don't know why it works.
I used pyperclip (which requires pip install pyperclip), along with the following code:
import pyperclip
import win32clipboard  # needed by the helpers below

def copy(text):
    win32clipboard.OpenClipboard()
    win32clipboard.EmptyClipboard()
    win32clipboard.SetClipboardText(text, win32clipboard.CF_UNICODETEXT)
    win32clipboard.CloseClipboard()

def paste():
    win32clipboard.OpenClipboard()
    data = win32clipboard.GetClipboardData(win32clipboard.CF_UNICODETEXT)
    win32clipboard.CloseClipboard()
    return data
Then later on...
mailItem = message.HTMLBody
pyperclip.copy(mailItem)

# Start an instance of Excel
Xlsx = win32com.client.Dispatch("Excel.Application")

# Create a new Excel Workbook
workbook = Xlsx.Workbooks.Add()
ws = workbook.Sheets("Sheet1")
ws.Range('a1').Select()  # Select() must be called; a bare .select does nothing
ws.Paste()
ws.Range('a1').Select()
workbook.SaveAs('myexcel.xlsx')
I tried doing this without pyperclip, but the combination of the two defs at the beginning and ws.Paste() is what worked.
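For what it's worth, the copy() helper defined above could stand in for the pyperclip call, since both simply place the HTML source on the clipboard as text (a sketch, using the same mailItem as before):

copy(mailItem)  # same effect as pyperclip.copy(mailItem)
ws.Paste()      # pastes into the currently selected cell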
Actually I am developing a POC in which we want an app that exposes a REST API and talks to MongoDB in Python.
For this we found several techs, such as Django REST framework (DRF) for the API side and djongo for the ORM side. Nevertheless, I scanned lots of tutorials on how to use the djongo ORM with DRF and found nothing, BUT apparently it's possible; can someone confirm?
My main problem is that my POC does not work at all: I used djongo models in my DRF serializers, but it fails and I don't understand why. Can someone figure out what's going on?
models.py:
from djongo import models

class Channel(models.Model):
    sourceId = models.IntegerField(default=-1)
    usageId = models.IntegerField(default=0)
    channelId = models.IntegerField(default=0)
    cabinetId = models.IntegerField(default=0)
    zoneId = models.IntegerField(default=0)

class Product(models.Model):
    dateCreation = models.DateTimeField(auto_now=True)
    dateUpdate = models.DateTimeField(auto_now=True)
    name = models.CharField(max_length=50, default="Unknown product name")
    channels = models.EmbeddedModelField(
        model_container=Channel,
    )
    objects = models.DjongoManager()
views.py:
from django.http import HttpResponse, JsonResponse
from django.views.decorators.csrf import csrf_exempt
from rest_framework.parsers import JSONParser
from Api.models import Product
from Api.serializers import ProductSerializer

@csrf_exempt
def ProductList(aRequest):
    """
    @brief List all products, or create a new product.
    """
    if aRequest.method == 'GET':
        wProducts = Product.objects.all()
        wSerializer = ProductSerializer(wProducts, many=True)
        return JsonResponse(wSerializer.data, safe=False)
    elif aRequest.method == 'POST':
        data = JSONParser().parse(aRequest)
        wSerializer = ProductSerializer(data=data)
        if wSerializer.is_valid():
            wSerializer.save()
            return JsonResponse(wSerializer.data, status=201)
        return JsonResponse(wSerializer.errors, status=400)

@csrf_exempt
def ProductDetail(aRequest, pk):
    """
    @brief Retrieve, update or delete a product.
    """
    try:
        wProducts = Product.objects.get(pk=pk)
    except Product.DoesNotExist:
        return HttpResponse(status=404)
    if aRequest.method == 'GET':
        wSerializer = ProductSerializer(wProducts)
        return JsonResponse(wSerializer.data)
    elif aRequest.method == 'PUT':
        data = JSONParser().parse(aRequest)
        wSerializer = ProductSerializer(wProducts, data=data)
        if wSerializer.is_valid():
            wSerializer.save()
            return JsonResponse(wSerializer.data)
        return JsonResponse(wSerializer.errors, status=400)
    elif aRequest.method == 'DELETE':
        wProducts.delete()
        return HttpResponse(status=204)
serializers.py:
from rest_framework import serializers
from Api.models import Product, Channel

class ChannelSerializer(serializers.ModelSerializer):
    class Meta:
        model = Channel
        fields = ('sourceId', 'usageId', 'channelId', 'cabinetId', 'zoneId')

    def create(self, validated_data):
        wChannel = Channel.objects.create(**validated_data)
        return wChannel

class ProductSerializer(serializers.ModelSerializer):
    channels = ChannelSerializer(many=True)

    class Meta:
        model = Product
        fields = ('dateCreation', 'dateUpdate', 'name', 'channels')

    def create(self, validated_data):
        wChannels = validated_data.pop("channels")
        wProduct = Product.objects.create(**validated_data)
        for wChannel in wChannels:
            Channel.objects.create(product=wProduct, **wChannel)
        return wProduct
When I run my server with this POST request:
{
    "dateCreation": "2018-07-20 12:00:00.000",
    "dateUpdate": "2018-07-20 12:00:00.000",
    "name": "post_test_channel_1",
    "channels": [{
        "sourceId": -1,
        "usageId": 100,
        "channelId": 0,
        "cabinetId": 0,
        "zoneId": 1
    }]
}
I get this stack trace:
Internal Server Error: /products/
Traceback (most recent call last):
  File "/home/soulasb/projects/POC/venv-app/lib/python3.6/site-packages/django/core/handlers/exception.py", line 35, in inner
    response = get_response(request)
  File "/home/soulasb/projects/POC/venv-app/lib/python3.6/site-packages/django/core/handlers/base.py", line 128, in _get_response
    response = self.process_exception_by_middleware(e, request)
  File "/home/soulasb/projects/POC/venv-app/lib/python3.6/site-packages/django/core/handlers/base.py", line 126, in _get_response
    response = wrapped_callback(request, *callback_args, **callback_kwargs)
  File "/home/soulasb/projects/POC/venv-app/lib/python3.6/site-packages/django/views/decorators/csrf.py", line 54, in wrapped_view
    return view_func(*args, **kwargs)
  File "/home/soulasb/projects/POC/PocEms/Api/views.py", line 25, in ProductList
    wSerializer.save()
  File "/home/soulasb/projects/POC/venv-app/lib/python3.6/site-packages/rest_framework/serializers.py", line 214, in save
    self.instance = self.create(validated_data)
  File "/home/soulasb/projects/POC/PocEms/Api/serializers.py", line 29, in create
    wProduct = Product.objects.create(**validated_data)
  File "/home/soulasb/projects/POC/venv-app/lib/python3.6/site-packages/django/db/models/manager.py", line 82, in manager_method
    return getattr(self.get_queryset(), name)(*args, **kwargs)
  File "/home/soulasb/projects/POC/venv-app/lib/python3.6/site-packages/django/db/models/query.py", line 417, in create
    obj.save(force_insert=True, using=self.db)
  File "/home/soulasb/projects/POC/venv-app/lib/python3.6/site-packages/django/db/models/base.py", line 729, in save
    force_update=force_update, update_fields=update_fields)
  File "/home/soulasb/projects/POC/venv-app/lib/python3.6/site-packages/django/db/models/base.py", line 759, in save_base
    updated = self._save_table(raw, cls, force_insert, force_update, using, update_fields)
  File "/home/soulasb/projects/POC/venv-app/lib/python3.6/site-packages/django/db/models/base.py", line 842, in _save_table
    result = self._do_insert(cls._base_manager, using, fields, update_pk, raw)
  File "/home/soulasb/projects/POC/venv-app/lib/python3.6/site-packages/django/db/models/base.py", line 880, in _do_insert
    using=using, raw=raw)
  File "/home/soulasb/projects/POC/venv-app/lib/python3.6/site-packages/django/db/models/manager.py", line 82, in manager_method
    return getattr(self.get_queryset(), name)(*args, **kwargs)
  File "/home/soulasb/projects/POC/venv-app/lib/python3.6/site-packages/django/db/models/query.py", line 1125, in _insert
    return query.get_compiler(using=using).execute_sql(return_id)
  File "/home/soulasb/projects/POC/venv-app/lib/python3.6/site-packages/django/db/models/sql/compiler.py", line 1284, in execute_sql
    for sql, params in self.as_sql():
  File "/home/soulasb/projects/POC/venv-app/lib/python3.6/site-packages/django/db/models/sql/compiler.py", line 1237, in as_sql
    for obj in self.query.objs
  File "/home/soulasb/projects/POC/venv-app/lib/python3.6/site-packages/django/db/models/sql/compiler.py", line 1237, in <listcomp>
    for obj in self.query.objs
  File "/home/soulasb/projects/POC/venv-app/lib/python3.6/site-packages/django/db/models/sql/compiler.py", line 1236, in <listcomp>
    [self.prepare_value(field, self.pre_save_val(field, obj)) for field in fields]
  File "/home/soulasb/projects/POC/venv-app/lib/python3.6/site-packages/django/db/models/sql/compiler.py", line 1176, in prepare_value
    value = field.get_db_prep_save(value, connection=self.connection)
  File "/home/soulasb/projects/POC/venv-app/lib/python3.6/site-packages/django/db/models/fields/__init__.py", line 767, in get_db_prep_save
    return self.get_db_prep_value(value, connection=connection, prepared=False)
  File "/home/soulasb/projects/POC/venv-app/lib/python3.6/site-packages/djongo/models/fields.py", line 461, in get_db_prep_value
    model=Model
ValueError: Value: None must be instance of Model: <class 'django.db.models.base.Model'>
This usually happens when you have added an EmbeddedModelField to your model but do not pass an object for that field when creating an entry for the model.
I didn't find an option to declare an EmbeddedModelField with null=True.
Hope this helps someone.
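A sketch of what that means for the serializer in the question (assuming a single embedded Channel, since EmbeddedModelField holds one document, and assuming djongo accepts a model instance for the field): build the Channel instance first and pass it explicitly when creating the Product:

def create(self, validated_data):
    wChannels = validated_data.pop("channels")
    # Pass a model instance for the embedded field instead of leaving it None.
    wChannel = Channel(**wChannels[0])
    return Product.objects.create(channels=wChannel, **validated_data)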
It could be ENFORCE_SCHEMA set to true in settings.py. Maybe change it to "ENFORCE_SCHEMA": False.
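For reference, a sketch of where that flag lives in a djongo settings.py (the database name here is a placeholder):

DATABASES = {
    'default': {
        'ENGINE': 'djongo',
        'NAME': 'your-db-name',  # placeholder
        'ENFORCE_SCHEMA': False,
    }
}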
I'm new to web apps, and I want to check whether there's a new version of the Dota map by checking the links on getdota.com.
How can I do this, and in which language? I want it to run every time you start Warcraft and automatically download the new map to a specific folder.
My question is: can you give a link to a specific article about web automation, or something like that?
Thanks first :)
Below is an example in Python.
It parses the getdota.com page, reads the parameters for the POST request that downloads a map, fetches the file, and saves it in a configured directory (by default the current directory).
#!/usr/bin/env python
import urllib
import urllib2
import sgmllib
from pprint import pprint
import os.path
import sys

url = 'http://www.getdota.com/'
download_url = 'http://www.getdota.com/app/getmap/'
chunk = 10000
directory = ''  # directory where file should be saved; if empty, uses current dir

class DotaParser(sgmllib.SGMLParser):
    def parse(self, s):
        self.feed(s)
        self.close()

    def __init__(self, verbose=0):
        sgmllib.SGMLParser.__init__(self, verbose)
        self.URL = ''
        self.post_args = {}

    def getArgs(self):
        return self.post_args

    def start_input(self, attributes):
        d = dict(attributes)
        if d.get('id', None) == None:
            return
        if d['id'] in ["input_mirror2", "input_file_name2", "input_map_id2", "input_language2", "input_language_id2"]:
            self.post_args[d['name']] = d['value']

if __name__ == '__main__':
    dotap = DotaParser()
    data = urllib2.urlopen(urllib2.Request('http://www.getdota.com/')).read()
    dotap.parse(data)
    data = urllib.urlencode(dotap.getArgs())
    request = urllib2.Request(download_url, data)
    response = urllib2.urlopen(request)
    page = response.read()

    # download file
    fname = directory + page.split('/')[-1]
    if os.path.isfile(fname):
        print "No newer file available"
        sys.exit(0)
    f = open(fname, 'w')
    print "New file available. Saving in: %s" % fname
    webFile = urllib.urlopen(page)
    c = webFile.read(chunk)
    while c:
        f.write(c)
        c = webFile.read(chunk)
    f.close()
    webFile.close()
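Note that this example is Python 2 only (urllib2 and sgmllib no longer exist in Python 3). A rough modern equivalent of the parsing step, assuming the page still exposes the same hidden input fields, might look like this:

from html.parser import HTMLParser
import requests

WANTED_IDS = {"input_mirror2", "input_file_name2", "input_map_id2",
              "input_language2", "input_language_id2"}

class DotaParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.post_args = {}

    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs for the tag.
        d = dict(attrs)
        if tag == "input" and d.get("id") in WANTED_IDS:
            self.post_args[d["name"]] = d.get("value", "")

parser = DotaParser()
parser.feed(requests.get("http://www.getdota.com/").text)
response = requests.post("http://www.getdota.com/app/getmap/",
                         data=parser.post_args)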