Performance of loop while getting data from GridFS - MongoDB

I'm using pymongo to get data from GridFS, and the loop that retrieves this data is really slow.
Is it possible to avoid that loop, or is there any way to make it faster?
from pymongo import MongoClient
from pprint import pprint
import bson
from gridfs import GridFS
import json
import pandas as pd
client = MongoClient()
client.database_names()
db = client['MC']
fs = GridFS(db, collection="MC")
db.collection_names(include_system_collections=False)
collectionFiles = db['MC.files']
collectionChunk = db['MC.chunks']
files = db['MC.files'].find({"metadata.Feature0": "00011"})
for n in files:
    file_id = n['_id']
    chunks = db['MC.chunks'].find({"files_id": file_id})
    bsondData = fs.get(file_id).read()
    decData = bsondData.decode()
    jsonData = json.loads(decData)
    F1 = jsonData['Feature1']
    F2 = jsonData['Feature2']

If you have enough RAM, it should be faster to fetch the files in groups rather than making one call to MongoDB per file.
You can try something like this:
batch_file_id = ['#1', '#2', '#3', '#4']  # placeholders for the _id values of the files you want
chunks = db['MC.chunks'].find({"files_id": {"$in": [bson.ObjectId(fid) for fid in batch_file_id]}})
...
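For completeness, here is a rough sketch (relying on the standard GridFS chunk layout, where each chunk document carries files_id, n and data) of how the batched chunks could be reassembled into per-file JSON instead of calling fs.get() once per file:
from collections import defaultdict

# Group chunk documents by file, then join each file's chunks in order of "n".
chunks_by_file = defaultdict(list)
for chunk in chunks:
    chunks_by_file[chunk["files_id"]].append(chunk)

for file_id, file_chunks in chunks_by_file.items():
    raw = b"".join(c["data"] for c in sorted(file_chunks, key=lambda c: c["n"]))
    jsonData = json.loads(raw.decode())
    F1 = jsonData['Feature1']
    F2 = jsonData['Feature2']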
Regards!!

Related

AttributeError: 'NoneType' object has no attribute 'impl'

After running my program I get the expected output, but I also get this error message:
Exception ignored in: <function Model.__del__ at 0x7f02ba33b430>
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/deepspeech/__init__.py", line 43, in __del__
AttributeError: 'NoneType' object has no attribute 'impl'
Here is the code. I am trying to convert a WAV audio file into text using the DeepSpeech library.
from deepspeech import Model
import numpy as np
import os
import wave
import json
from IPython.display import Audio
from IPython.display import clear_output
model_file_path = 'deepspeech-0.8.2-models.pbmm'
lm_file_path = 'deepspeech-0.8.2-models.scorer'
beam_width = 100
lm_alpha = 0.93
lm_beta = 1.18
model = Model(model_file_path)
model.enableExternalScorer(lm_file_path)
model.setScorerAlphaBeta(lm_alpha, lm_beta)
model.setBeamWidth(beam_width)
def read_wav_file(filename):
    with wave.open(filename, 'rb') as w:
        rate = w.getframerate()
        frames = w.getnframes()
        buffer = w.readframes(frames)
        return buffer, rate

def transcribe(audio_file):
    buffer, rate = read_wav_file(audio_file)
    data16 = np.frombuffer(buffer, dtype=np.int16)
    return model.stt(data16)

print(transcribe('speech.wav'))
Importing IPython is causing the issue; try running your code without it and it should work. The "Exception ignored" message appears to come from Model.__del__ running during interpreter shutdown, after the module it relies on has already been torn down, so the transcription output itself is not affected.
from deepspeech import Model
import numpy as np
import os
import wave
import json
model_file_path = 'deepspeech-0.8.2-models.pbmm'
lm_file_path = 'deepspeech-0.8.2-models.scorer'
beam_width = 100
lm_alpha = 0.93
lm_beta = 1.18
model = Model(model_file_path)
model.enableExternalScorer(lm_file_path)
model.setScorerAlphaBeta(lm_alpha, lm_beta)
model.setBeamWidth(beam_width)
def read_wav_file(filename):
    with wave.open(filename, 'rb') as w:
        rate = w.getframerate()
        frames = w.getnframes()
        buffer = w.readframes(frames)
        return buffer, rate

def transcribe(audio_file):
    buffer, rate = read_wav_file(audio_file)
    data16 = np.frombuffer(buffer, dtype=np.int16)
    return model.stt(data16)

print(transcribe('speech.wav'))

Can we give Max & Min values statically in normalization using MinMaxScaler Sklearn?

I have a question and have been looking for answers.
Below is the input of the POST request:
{
    "emotive_Score": [0.89, 0.57, 0.089, 0, 0.004, 0, 0],
    "sentiment_Score": [1.521894, -6.4523187],
    "mood_score": [40]
}
And I'm using the following code to scale the values.
from flask import Flask, request
from flask_restful import Resource, Api
from json import dumps
from sklearn import preprocessing
import numpy as np

class MoodScore(Resource):
    def post(self):
        json_data = request.get_json(force=True)
        if not json_data:
            return {'message': 'No input data provided'}, 400
        x = request.json['emotive_Score']
        x1 = request.json['sentiment_Score']
        x2 = request.json['mood_score']
        # Normalisation for Emotive Score
        xEmotive = np.array(x)
        PositiveEmotive = str(xEmotive[4] + xEmotive[6])
        NegativeEmotive = str(xEmotive[0] + xEmotive[1] + xEmotive[2] + xEmotive[3] + xEmotive[5])
        EmotiveScoreArray = (PositiveEmotive, NegativeEmotive)
        Nml = np.array(EmotiveScoreArray)
        float_array = Nml.astype(float)
        xM = float_array.reshape(-1, 1)
        minmaxscaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
        Emotive = minmaxscaler.fit_transform(xM)
        # Normalisation for Sentiment Score
        xSentiment = np.array(x1)
        PositiveSentiment = str(xSentiment[0])
        NegativeSentiment = str(xSentiment[1])
        SentimentScoreArray = (PositiveSentiment, NegativeSentiment)
        Nml1 = np.array(SentimentScoreArray)
        float_array1 = Nml1.astype(float)
        xM1 = float_array1.reshape(-1, 1)
        minmaxscaler1 = preprocessing.MinMaxScaler(feature_range=(-1, 1))
        Sentiment = minmaxscaler1.fit_transform(xM1)
        return {'PositiveEmotive': str(Emotive[0]),
                'NegativeEmotive': str(Emotive[1]),
                'PositiveSentiment': str(Sentiment[0]),
                'NegativeSentiment': str(Sentiment[1]),
                'FinalValue': str(Emotive[0] + Emotive[1] + Sentiment[0] + Sentiment[1])}
        # return {'FinalScore': str(Sentiment)}

app = Flask(__name__)
api = Api(app)
api.add_resource(MoodScore, '/moodScore')

if __name__ == '__main__':
    app.run(port='5005', host="0.0.0.0")
And I'm getting the following as output.
{
    "PositiveEmotive": "[0.]",
    "NegativeEmotive": "[1.]",
    "PositiveSentiment": "[1.]",
    "NegativeSentiment": "[-1.]",
    "FinalValue": "[1.]"
}
I just want to know whether I can give static Min and Max values during the normalization calculation, so that I can get the desired result below:
{
    "PositiveEmotive": "[0.546]",
    "NegativeEmotive": "[1.]",
    "PositiveSentiment": "[0.598]",
    "NegativeSentiment": "[-0.6879]",
    "FinalValue": "[1.4561]"
}
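One way to get static bounds is to fit MinMaxScaler on the fixed minimum and maximum themselves instead of on the incoming values, or to apply the min-max formula directly. A minimal sketch, where the bounds are made-up examples rather than values taken from the question:
import numpy as np
from sklearn import preprocessing

# Hypothetical fixed bounds -- replace with the range your scores can actually take.
SENTIMENT_MIN, SENTIMENT_MAX = -10.0, 10.0

scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
# Fit on the fixed bounds, not on the request values, so the scaling is identical for every request.
scaler.fit(np.array([[SENTIMENT_MIN], [SENTIMENT_MAX]]))

values = np.array([1.521894, -6.4523187]).reshape(-1, 1)
print(scaler.transform(values))

# Equivalent without a scaler object: (x - min) / (max - min), mapped into the feature range.
lo, hi = -1, 1
print((values - SENTIMENT_MIN) / (SENTIMENT_MAX - SENTIMENT_MIN) * (hi - lo) + lo)
The exact numbers in the desired output depend entirely on which fixed minimum and maximum you choose.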

insert_many not working when adding one more case

I used the following code to insert tab0011.json into portal_db.acs:
from pymongo import MongoClient
import json
client = MongoClient()
db = client.portal_db
db.acs.drop()
acs = db.acs
data_acs = json.load(open('/vagrant/data/tab0011.json', 'r'))
result_acs = acs.insert_many(data_acs)
That code stored the tab0011.json data correctly. However, I then tried the following code to insert tab0011.json into portal_db.acs and tab0007.json into portal_db.tab0007. Both collections were created, but both are empty:
from pymongo import MongoClient
import json
client = MongoClient()
db = client.portal_db
db.acs.drop()
acs = db.acs
db.tab0007.drop()
tab0007 = db.tab0007
data_acs = json.load(open('/vagrant/data/tab0011.json', 'r'))
data_tab0007 = json.load(open('/vagrant/data/tab0007.json', 'r'))
result_acs = acs.insert_many(data_acs)
result_tab0007 = tab0007.insert_many(data_tab0007)
I'm not quite sure why.
If the file extension is .json, I am able to read the data via the methods used in your code and insert it into collections in the same database, and I can see the data I used in both of the respective collections.
Maybe you can try doing it this way:
from pymongo import MongoClient
import json
client = MongoClient(host="localhost", port=27017)
db = client["portal_db"]
acs = db.get_collection("acs")
tab0007 = db.get_collection("tab0007")
db.drop_collection("acs")
db.drop_collection("tab0007")
data_acs = json.load(open('/vagrant/data/tab0011.json', 'r'))
data_tab0007 = json.load(open('/vagrant/data/tab0007.json', 'r'))
acs_inserts = acs.insert_many(data_acs)
tab_inserts = tab0007.insert_many(data_tab0007)
print(acs_inserts.inserted_ids)
print(tab_inserts.inserted_ids)
The last two lines print the ObjectIds of the inserted documents.
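If one of the collections still ends up empty, it is also worth checking what json.load returned for that file: insert_many expects a list (or other iterable) of documents, so a file whose top level is a single JSON object needs insert_one instead. A small guard along those lines, offered only as a sketch since the contents of tab0007.json are not shown here:
def insert_json(collection, path):
    # Insert a JSON file that may hold either a list of documents or a single document.
    with open(path, 'r') as f:
        data = json.load(f)
    if isinstance(data, list):
        return collection.insert_many(data)
    return collection.insert_one(data)

acs_inserts = insert_json(acs, '/vagrant/data/tab0011.json')
tab_inserts = insert_json(tab0007, '/vagrant/data/tab0007.json')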

Get "holes" in dates in MogoDB collection

I have a MongoDB collection that stores data for each hour since 2011.
For example:
{
    "dateEntity" : ISODate("2011-01-01T08:00:00Z"),
    "price" : 0.3
}
{
    "dateEntity" : ISODate("2011-01-01T09:00:00Z"),
    "price" : 0.35
}
I'd like to know whether there are "holes" in those dates, for example a missing entry for a given hour.
Unfortunately, there is no gap-finding aggregation operator in MongoDB.
I checked whether it is possible to write a custom gap aggregator for MongoDB based on JavaScript functions in map-reduce pipelines, creating a time raster in the first map stage and then mapping it to the corresponding values; however, database reads are discouraged while mapping and reducing, so that would be bad design. So it is not possible to achieve this with MongoDB's own instruments.
I think there are two possible solutions.
Solution one: Use a driver like the Java driver
I suggest using an idiomatic driver, such as the Java driver, to read your MongoDB data and build a raster of hours, as in the test below.
import com.mongodb.BasicDBObject;
import com.mongodb.MongoClient;
import com.mongodb.ServerAddress;
import com.mongodb.client.MongoCollection;
import org.bson.Document;
import org.junit.Test;
import java.time.LocalDateTime;
import java.time.ZoneOffset;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
public class HourGapsTest {

    @Test
    public void testHourValues() {
        String host = "127.0.0.1:27017";
        ServerAddress addr = new ServerAddress(host);
        MongoClient mongoClient = new MongoClient(addr);
        MongoCollection<Document> collection = mongoClient.getDatabase("sotest").getCollection("hourhole");

        LocalDateTime start = LocalDateTime.of(2011, 1, 1, 8, 0, 0);
        LocalDateTime end = LocalDateTime.of(2011, 1, 2, 0, 0, 0);

        // Build the full raster of hours between start and end
        List<LocalDateTime> allHours = new ArrayList<>();
        for (LocalDateTime hour = start; hour.isBefore(end); hour = hour.plusHours(1L)) {
            allHours.add(hour);
        }

        // Every hour without a matching document is a gap
        List<LocalDateTime> gaps = new ArrayList<>();
        for (LocalDateTime hour : allHours) {
            BasicDBObject filter = new BasicDBObject("dateEntity",
                    new Date(hour.toInstant(ZoneOffset.UTC).toEpochMilli()));
            if (!collection.find(filter).iterator().hasNext()) {
                gaps.add(hour);
            }
        }

        gaps.forEach(System.out::println);
    }
}
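Since the other examples on this page are in Python, roughly the same check can also be sketched with pymongo (assuming the same sotest.hourhole collection and that dateEntity is stored in UTC, which is how pymongo interprets naive datetime objects):
from datetime import datetime, timedelta
from pymongo import MongoClient

collection = MongoClient("127.0.0.1", 27017)["sotest"]["hourhole"]

start = datetime(2011, 1, 1, 8, 0, 0)
end = datetime(2011, 1, 2, 0, 0, 0)

# Walk the hour raster and record every hour that has no matching document.
gaps = []
hour = start
while hour < end:
    if collection.count_documents({"dateEntity": hour}, limit=1) == 0:
        gaps.append(hour)
    hour += timedelta(hours=1)

for gap in gaps:
    print(gap)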
Solution two: Use a timeseries database
However, timeseries databases like KairosDB provide this functionality. Consider storing this time-value data in a timeseries database.

Copy Feature Classes in a String

I am trying to copy only the feature classes I specify, rather than copying them all, using an if check inside a loop. I tried the code below and it executes, but it does not copy the files.
import arcpy
import os
arcpy.env.workspace = r'c:\arcgis\ArcTutor\ModelBuilder\GTKModelbuilder\Data\RFDA Shapefiles'
outWorkspace = r'C:\Output'
fcList = arcpy.ListFeatureClasses()
for shapefile in fcList:
    if fcList == ('BedfordCalls.shp', 'ColleyvilleCalls.shp', 'HurstCalls.shp', 'KellersCalls.shp'):
        outFeatureClass = os.path.join(outWorkspace, shapefile.strip(".shp"))
        arcpy.CopyFeatures_management(shapefile, outFeatureClass)
import arcpy
import os
arcpy.env.workspace = r'c:\Shapefile'
outWorkspace = r'C:\Output'
fcList = arcpy.ListFeatureClasses()
for shapefile in fcList:
    if 'BedfordCalls.shp' or 'ColleyvilleCalls.shp' or 'HurstCalls.shp' or 'KellersCalls.shp' in fcList:
        outFeatureClass = os.path.join(outWorkspace, shapefile.strip(".shp"))
        arcpy.CopyFeatures_management(shapefile, outFeatureClass)
The below should do what you want more effectively (noting comments on earlier answer):
import arcpy
import os
arcpy.env.workspace = r'c:\Shapefile'
outWorkspace = r'C:\Output'
fcList = ['BedfordCalls.shp', 'ColleyvilleCalls.shp', 'HurstCalls.shp', 'KellersCalls.shp']
for shapefile in fcList:
    outFeatureClass = os.path.join(outWorkspace, shapefile)
    arcpy.CopyFeatures_management(shapefile, outFeatureClass)
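If you would rather keep going through arcpy.ListFeatureClasses() and copy only the ones you name (for example, so that a shapefile missing from the workspace is simply skipped), a variation along these lines should also work; this is just a sketch of the same idea, not something tested against your data:
import arcpy
import os

arcpy.env.workspace = r'c:\Shapefile'
outWorkspace = r'C:\Output'

# Only the feature classes named here will be copied.
wanted = {'BedfordCalls.shp', 'ColleyvilleCalls.shp', 'HurstCalls.shp', 'KellersCalls.shp'}

for shapefile in arcpy.ListFeatureClasses():
    if shapefile in wanted:
        arcpy.CopyFeatures_management(shapefile, os.path.join(outWorkspace, shapefile))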