Checking to see if record exists in MongoDB before Scrapy inserts - mongodb

As the title implies, I'm running a Scrapy spider and storing results in MongoDB. Everything is running smoothly, except when I re-run the spider, it adds everything again, and I don't want the duplicates. My pipelines.py file looks like this:
import logging
import pymongo
from pymongo import MongoClient
from scrapy.conf import settings
from scrapy import log
class MongoPipeline(object):
    """Scrapy item pipeline that stores each scraped quote in MongoDB once.

    Compared with the originally posted version, the duplicate check is
    repaired: ``find()`` returns a Cursor, which never equals ``True``, and
    the filter key ``' quote_text'`` carried a leading space, so every item
    was inserted again on every run.
    """

    collection_name = 'openings'

    def __init__(self, mongo_uri, mongo_db):
        """Store the connection settings; the connection is opened later.

        Args:
            mongo_uri: MongoDB connection URI.
            mongo_db: Name of the database to write to.
        """
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the MONGO_URI / MONGO_DATABASE settings."""
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE'),
        )

    def open_spider(self, spider):
        """Open the MongoDB connection when the spider starts."""
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        """Close the MongoDB connection when the spider finishes."""
        self.client.close()

    def process_item(self, item, spider):
        """Insert *item* unless a document with the same quote text exists.

        Returns:
            The unchanged item, so later pipelines still receive it.
        """
        collection = self.db[self.collection_name]
        # find_one returns a document or None, which gives a real yes/no
        # answer; a Cursor from find() is never equal to True.
        duplicate = collection.find_one({'quote_text': item['quote_text']})
        if duplicate is None:
            collection.insert_one(dict(item))
            logging.debug("Post added to MongoDB")
        return item
My spider looks like this:
import scrapy
from ..items import QuotesItem
class QuoteSpider(scrapy.Spider):
    """Spider that scrapes each quote and its author from quotes.toscrape.com."""

    name = 'quote'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        """Yield one ``QuotesItem`` per quote block found in *response*."""
        # XPath selects attributes with "@"; the "#class" in the posted
        # version is not valid XPath.
        for quote in response.xpath('//*[@class="quote"]'):
            # Create a fresh item per quote: reusing one instance makes every
            # yielded item the same object, which later iterations overwrite.
            item = QuotesItem()
            item['author'] = quote.xpath(
                './/*[@class="author"]//text()').extract_first()
            item['quote_text'] = quote.xpath(
                './/span[@class="text"]//text()').extract_first()
            yield item
The current syntax is obviously wrong, but is there a slight fix to the for loop to make to fix it? Should I be running this loop in the spider instead? I was also looking at upsert but was having trouble understanding how to use that effectively. Any help would be great.

Looks like you have a leading space here: self.db.openings.find({' quote_text': item['quote_text']}). I suppose it should just be 'quote_text'?
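Switching from == True to is True will not help: find() returns a Cursor object, which is never equal (or identical) to True, so the else branch always runs. This is the reason it adds everything again. Test for an actual document instead, e.g. if self.db[self.collection_name].find_one({'quote_text': item['quote_text']}) is not None: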
I would suggest using find_one (PyMongo's name for the shell's findOne) instead of find: it returns a single document or None, and it will be more efficient.
Using upsert instead is indeed a good idea, but the logic will be slightly different: you will update the data if the item already exists, and insert it when it doesn't exist (instead of doing nothing when the item already exists). The syntax should look something like this: self.db[self.collection_name].update_one({'quote_text': item['quote_text']}, {'$set': dict(item)}, upsert=True)

steps :
if the collection is empty: write the item to the collection
if the collection is not empty and the item exists: pass
else (collection not empty and the item doesn't exist): write the item to the collection
code:
def process_item(self, item, spider):
    """Insert *item* into the collection unless an identical document exists.

    A document counts as identical when every field of *item* matches. An
    empty collection needs no special case: the lookup simply finds nothing.

    Returns:
        The unchanged item, so later pipelines still receive it.
    """
    collection = self.db[self.collection_name]
    document = dict(item)
    # A single find_one replaces materialising the whole collection into a
    # list (twice) for every scraped item.
    if collection.find_one(document) is not None:
        print("item exist")
    else:
        print("new item")
        collection.insert_one(document)
        logging.debug("Post added to MongoDB")
    return item

Related

Mock inner class's attributes using MagicMock

Apologies for the lengthy post. I have been trying to wrap my head around mock and MagicMock, and I keep getting confused. Hence, I decided to write this post.
I know several questions, and pages have been written on this. But, still not able to wrap my head around this.
My Setup:
All the test code, and the 2 module files come under one "folder" mymodule
my_module_1.py file contains
class MyOuterClass(object):
    """Wrapper whose ``get`` returns a populated ``MyInnerClass`` instance."""

    class MyInnerClass(object):
        """Plain holder for the three string attributes filled in by ``get``."""

        attribute1: str
        attribute2: str
        attribute3: str

    def get(self) -> MyInnerClass:
        '''
        pseudocode
        1. a call to AWS's service is made
        2. the output from call in step 1 is used to set attributes of this InnerClass
        3. return innerclass instance
        '''
I use the OuterClass in another file(my_module_2.py), to set some values and return a string as follows:
class MyModule2():
    """Builds the ``WITH ( BAR ( ... ) )`` statement from the outer class."""

    def get_foo(self, some_boolean_predicate):
        """Return the rendered statement, or ``''`` when the predicate is falsy."""
        if not some_boolean_predicate:
            return ''
        inner = my_module_1.OuterClass().get()
        return f'''
WITH (
BAR (
FIELD_1 = '{inner.attribute1}',
FIELD_2 = '{inner.attribute2}',
FIELD_3 = '{inner.attribute3}'
)
)
'''
I want to write the unit tests for the file my_module_2.py file, and test the function get_foo
How I am writing the tests(or planning on)
a test file by name test_my_module2.py
I started with creating a pytest.fixture for the MyOuterClass's get function as follows since I will be reusing this piece of info again in my other tests
@pytest.fixture
def mock_get(mocker: MockerFixture) -> MagicMock:
    """Patch ``MyOuterClass.get`` and return the mock that replaces it."""
    return mocker.patch.object(MyOuterClass, 'get')
Finally,
Then I proceeded to use this fixture in my test as follows:
from unittest import mock
from unittest.mock import MagicMock, Mock, patch, PropertyMock
import pytest
from pytest_mock import MockerFixture
from my_module.my_module_1 import myOuterClass
def test_should_get_from_inner_class(self, mock_get):
    """Check that ``MyModule2.get_foo`` renders the inner-class attributes.

    NOTE(review): ``self`` is kept from the original; presumably this
    function sits inside a test class that is not shown -- confirm.
    """
    # mock call to get are made
    # NOTE(review): ``mock_get`` already is the mock that replaces ``get``, so
    # ``mock_get.get`` is an auto-created child attribute, not the object that
    # calling ``get()`` hands to the code under test.
    output = mock_get.get
    #update the values for the InnerClass's attributes here
    # NOTE(review): ``side_effect`` only affects what happens when a mock is
    # called; reading ``attributeN`` inside an f-string still yields the mock.
    output.attribute1.side_effect = 'attr1'
    output.attribute2.side_effect = 'attr2'
    output.attribute3.side_effect = 'attr3'
    # Expected rendering of the statement produced by get_foo.
    mock_output_str = '''
WITH (
BAR (
FIELD_1 = 'attr1',
FIELD_2 = 'attr2',
FIELD_3 = 'attr3'
)
)
'''
    module2Obj = MyModule2()
    response = module2Obj.get_foo(some_boolean_predicate=True)
    # the following assertion passes
    # NOTE(review): ``called_once`` is not a mock assertion method; it is an
    # auto-created attribute whose call returns a truthy mock, so this assert
    # cannot fail. ``assert_called_once()`` is the real check.
    assert mock_get.get.called_once()
    # I would like match `response to that with mock_output_str instance above
    assert response == mock_output_str
But, the assertion as you might have guessed failed, and I know I am comparing completely different types, since I see
errors such as
FAILED [100%]
WITH (
BAR (
FIELD1 = '<MagicMock name='get().attr1' id='4937943120'>',
FIELD3 = '<MagicMock name='get().attr2' id='4937962976'>',
FIELD3 = '<MagicMock name='get().attr3' id='4937982928'>'
)
)
Thank you for being patient with me till here, i know its a really lengthy post, but stuck on this for a few days, ended up creating a post here.
How do i get to validate the mock's value with the mock_output_str?
Yes! The hint was in @gold_cy's answer: I was only calling my mock and never setting its values.
this is what my test case ended up looking
mock_obj = OuterClass.InnerClass()
mock_obj.attribute1='some-1'
mock_obj.attribute2='some-2'
mock_obj.attribute3='some-3'
mock_get.return_value = mock_obj
once my mock was setup properly, then the validation became easy! Thank you!

APScheduler update database only once

I'm trying to get APScheduler to update the PostgreSQL database in my Flask app every 5 minutes, but the database is only updated the first time; on all subsequent runs the changes are not saved. APScheduler itself works correctly: if I replace the database-update function with one that just prints some text, it runs correctly every time.
In my app im using Flask-SQLAlchemy:
# Format: dialect+driver://user:password@host/database ("@" separates the
# credentials from the host).
SQLALCHEMY_DATABASE_URI = 'postgresql+psycopg2://postgres:name@localhost/name'
The APScheduler code looks like this:
from apscheduler.schedulers.blocking import BlockingScheduler
# Run `update` on a fixed 5-minute interval.
# NOTE(review): the effect of `daemon=True` on a BlockingScheduler is not
# visible from here -- confirm it is needed.
sched = BlockingScheduler(daemon=True)
sched.add_job(func=update, trigger='interval', minutes=5)
# NOTE(review): start() on a blocking scheduler presumably does not return
# until shutdown, so nothing placed after this line would run -- confirm.
sched.start()
The database update function looks like this:
def update():
    """Copy name, gender and age from each entry of ``data`` onto the
    ``Names`` row with the same id, then commit the session.

    NOTE(review): the original indentation was lost; the commit is assumed
    to run once, after both loops -- confirm.
    """
    for i in data:
        # Every row of Names is scanned for each entry of `data`.
        for row in Names.query:
            if row.id == i['id']:
                row.name = i['name']
                row.gender = i['gender']
                row.age = i['age']
    db.session.commit()
In the logs, APScheduler always works successfully. I also looked at the Postgresql logs, where I found this phrase: 'An existing connection was forcibly closed by the remote host.'
I suspect it might be the database engine and sessions, but I haven't found the instructions I need to implement within the Flask-SQLAlchemy package.
Versions of packages:
Flask-SQLAlchemy==2.4.1
SQLAlchemy==1.3.17
APScheduler==3.6.3
db Model:
class Names(db.Model):
    """Model with an integer primary key plus name, gender and age columns."""

    # NOTE(review): presumably read by a search extension -- confirm.
    __searchable__ = ['name', 'age']
    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(40))
    gender = db.Column(db.String(40))
    age = db.Column(db.Numeric)

    def __repr__(self):
        """Return ``<Names id>`` using the repr of the primary key."""
        return '<Names %r>' % self.id
I think I figured out what the problem is. APScheduler somehow caches the contents of a variable the first time it is used and then only uses that value.
Before the function, I have the following code:
# Executed once, when the module is loaded: `data` is a snapshot of the API
# response and is not refreshed by later calls to update().
request = requests.get('https://privateapi')
data = request.json()
Then the function takes data from data:
def update():
    """Copy name, gender and age from each entry of the module-level ``data``
    onto the ``Names`` row with the same id, then commit the session.

    NOTE(review): the original indentation was lost; the commit is assumed
    to run once, after both loops -- confirm.
    """
    for i in data:
        for row in Names.query:
            if row.id == i['id']:
                row.name = i['name']
                row.gender = i['gender']
                row.age = i['age']
    db.session.commit()
According to the Flask-SQLAlchemy logs, the data is written to the database successfully. I tried adding to the print (data) function so that every 5 minutes it would show me the contents of the data variable and I saw that its contents were not updated.
It turns out that the data is written to the database, but with the same values, so I don't see its update.
Then I tried shortening the request path and not saving its content to a variable:
def update():
    """Fetch the API payload on every call and copy name, gender and age onto
    the ``Names`` row with the same id, then commit the session.

    NOTE(review): the original indentation was lost; the commit is assumed
    to run once, after both loops -- confirm.
    """
    # The request is issued inside the function, so each call sees a fresh
    # response rather than a value captured at import time.
    for i in requests.get('https://privateapi').json():
        for row in Names.query:
            if row.id == i['id']:
                row.name = i['name']
                row.gender = i['gender']
                row.age = i['age']
    db.session.commit()
But here again nothing has changed.
UPDATE:
I solved this problem by removing the data variable at the end of the function:
def update():
    """Refresh the ``Names`` rows from the remote API and commit once.

    The payload is fetched inside the function, so every scheduled run works
    on the current API response. Entries whose id has no matching row are
    skipped, as before.
    """
    entries = requests.get('https://privateapi').json()
    for entry in entries:
        # Primary-key lookup instead of scanning the whole table per entry.
        row = Names.query.get(entry['id'])
        if row is None:
            continue
        row.name = entry['name']
        row.gender = entry['gender']
        row.age = entry['age']
    # No `del` is needed: the local payload is released when the call returns.
    db.session.commit()

H2 database content is not persisting on insert and update

I am using h2 database to test my postgres slick functionality.
I created a below h2DbComponent:
// DbComponent implementation backed by an in-memory H2 database running in
// PostgreSQL compatibility mode.
trait H2DBComponent extends DbComponent {
  val driver = slick.jdbc.H2Profile
  import driver.api._
  // INIT runs schema.sql and then schemadata.sql.
  // NOTE(review): H2 presumably executes INIT for every new connection, so
  // both scripts should be safe to run repeatedly -- confirm.
  val h2Url = "jdbc:h2:mem:test;MODE=PostgreSQL;DB_CLOSE_DELAY=-1;DATABASE_TO_UPPER=false;INIT=runscript from './test/resources/schema.sql'\\;runscript from './test/resources/schemadata.sql'"
  val logger = LoggerFactory.getLogger(this.getClass)
  // Database handle built from h2Url with the H2 JDBC driver.
  val db: Database = {
    logger.info("Creating test connection ..................................")
    Database.forURL(url = h2Url, driver = "org.h2.Driver")
  }
}
In the above snippet i am creating my tables using schema.sql and inserting a single row(record) with schemadata.sql.
Then i am trying to insert a record into the table as below using my test case:
// Exercises RequestRepo against the database provided by H2DBComponent.
class RequestRepoTest extends FunSuite with RequestRepo with H2DBComponent {
  test("Add new Request") {
    // Insert one request and wait up to 10 seconds for the affected-row count.
    val response = insertRequest(Request("XYZ","tk", "DM", "RUNNING", "0.1", "l1", "file1",
      Timestamp.valueOf("2016-06-22 19:10:25"), Some(Timestamp.valueOf("2016-06-22 19:10:25")), Some("scienceType")))
    val actualResult=Await.result(response,10 seconds)
    assert(actualResult===1)
    // NOTE(review): 2 is presumably the row seeded by schemadata.sql plus the
    // row inserted above -- confirm.
    val response2 = getAllRequest()
    assert(Await.result(response2, 5 seconds).size === 2)
  }
}
The above assert of insert works fine stating that the record is inserted. But the getAllRequest() assert fails as the output still contains the single row(as inserted by schemadata.sql) => which means the insertRequest change is not persisted. However the below statements states that the record is inserted as the insert returned 1 stating one record inserted.
val response = insertRequest(Request("CMP_XYZ","tesco_uk", "DM", "RUNNING", "0.1", "l1", "file1",
Timestamp.valueOf("2016-06-22 19:10:25"), Some(Timestamp.valueOf("2016-06-22 19:10:25")),
Some("scienceType")))
val actualResult=Await.result(response,10 seconds)
Below is my definition of insertRequest:
// Appends `request` to requestTableQuery and returns the future row count.
def insertRequest(request: Request): Future[Int] =
  db.run(requestTableQuery += request)
I am unable to figure out how can i see the inserted record. Is there any property/config which i need to add?
But the getAllRequest() assert fails as the output still contains the single row(as inserted by schemadata.sql) => which means the insertRequest change is not persisted
I would double-check that the assert(Await.result(response2, 5 seconds).size === 2) line is failing because of a size difference. Could it be failing for some other general failure?
For example, as INIT is run on each connection it could be that you are re-creating the database for each connection. Unless you're careful with the SQL, that could produce an error such as "table already exists". Adding TRACE_LEVEL_SYSTEM_OUT=2; to your H2 URL can be helpful in tracking what H2 is doing.
A couple of suggestions.
First, you could ensure your SQL only runs as needed. For example, your schema.sql could add checks to avoid trying to create the table twice:
CREATE TABLE IF NOT EXISTS my_table( my_column VARCHAR NULL );
And likewise for your schemadata.sql:
MERGE INTO my_table KEY(my_column) VALUES ('a') ;
Alternatively, you could establish schema and test data around your tests (e.g., possibly in Scala code, using Slick). Your test framework probably has a way to ensure something is run before and after a test or test suite.

need help in understanding if the way I am testing a function is correct

I have written this function which is called when a user clicks a link. The function basically creates a copy of the user data with one field altered (thus keeping the original value unchanged i.e. not-mutable) and then updates the database with the new value
// Builds a copy of `user` whose internal profile has confirmed=true, stores it
// through userRepo.update and returns whatever the repository yields.
def confirmSignupforUser(user:User):Future[Option[User]] = {
  println("confirming user: "+user)
  val currentInternal = user.profile.internalProfileDetails.get
  val confirmedInternal = currentInternal.copy(confirmed=true) // new data which should be added in the database
  println("old internal profile: "+currentInternal)
  println("new internal profile: "+confirmedInternal)
  val confirmedProfile = UserProfile(Some(confirmedInternal),user.profile.externalProfileDetails)
  println("old profile: "+user.profile)
  println("new profile: "+confirmedProfile)
  val confirmedUser = user.copy(profile=confirmedProfile)
  // database operation
  userRepo.update(confirmedUser).map { storedUser =>
    println("returning modified user:"+storedUser)
    storedUser
  }
}
To test the code, I have written the following spec
// Spec: confirmSignupforUser must hand the repository a user whose internal
// profile is confirmed, and must return what the repository returns.
"confirmSignupforUser" should {
  "change confirmed status to True" in {
    val testEnv = new TestEnv(components.configuration)
    val externalProfile = testEnv.externalUserProfile
    // Two internal profiles that differ only in the `confirmed` flag.
    val internalUnconfirmedProfile = InternalUserProfile(testEnv.loginInfo,1,false,None)
    val internalConfirmedProfile = internalUnconfirmedProfile.copy(confirmed=true)
    val unconfirmedProfile = UserProfile(Some(internalUnconfirmedProfile),externalProfile)
    val confirmedProfile = UserProfile(Some(internalConfirmedProfile),externalProfile)
    // The input user is unconfirmed; confirmedUser is the expected argument and result.
    val origUser = User(testEnv.mockHelperMethods.getUniqueID(),unconfirmedProfile)
    val confirmedUser = origUser.copy(profile = confirmedProfile)
    //the argument passed to update is part of test. The function confirmSignupforUser should pass a confirmed profile
    when(testEnv.mockUserRepository.update(confirmedUser)).thenReturn(Future{Some(confirmedUser)})
    //// await is from play.api.test.FutureAwaits
    val updatedUserOption:Option[User] = await[Option[User]](testEnv.controller.confirmSignupforUser(origUser))
    println(s"received updated user option ${updatedUserOption}")
    updatedUserOption mustBe Some(confirmedUser)
  }
}
I am not confident if I am testing the method correctly. The only way I can check that the confirmed field got changed is by looking at the return value of confirmSignupforUser. But I am actually mocking the value and I have already set the field confirmed to true in the mocked value (when(testEnv.mockUserRepository.update(confirmedUser)).thenReturn(Future{Some(confirmedUser)}).
I know the code works because in the above mock, the update method expects confirmedUser or in other words, a user with confirmed field set to true. So if my code wasn't working, update would have been called with user whose confirmed field was false and mockito would have failed.
Is this the right way to test the method or is there a better way?
You don't need to initialize internalConfirmedProfile in your test. The whole point is to start with confirmed=false, run the confirmSignupforUser method, and make sure that the output has confirmed=true.
You should check 2 things:
check that the return value has confirmed=true (which you do)
check that the repository has that user saved with confirmed=true (which you don't check). To check that you would need to load the user back from the repository at the end.

PySide - QSortFilterProxyModel and QListView - indexWidget pointer get deleted when filtering

I've a problem with a custom QListView I'm trying to make, here the problem:
I'm using QListView to show a list of QWidget by using QListView.setIndexWidget(index,widget).
This is working pretty fine, but now I want to filter the items model by using QSortFilterProxyModel()
with .setFilterWildcard()
It is not working very well because the second time the model is filtered
I got error like this :
RuntimeError: Internal C++ object (PySide.QtGui.QLabel) already deleted.
Without using filtering and QSortFilterProxyModel everything works fine, but it seems I'm missing
something with the filtering operation, the indexWidget() is deleted when using filtering :(
here a sample code where you can reproduce the bug, when list view is shown, hit 1,2 or 3 keyboard
key to activate filtering ( Backspace to set filtering empty to show all items )
Here the sample code to reproduce the problem:
import PySide.QtGui as QtGui
import PySide.QtCore as QtCore
_DEFAULT_ITEM_SIZE = QtCore.QSize(100, 85)
_USER_ROLE = QtGui.QStandardItem.UserType + 1
class CustomItemWidget(QtGui.QWidget):
    """Widget that paints a rounded grey tile and shows one text label."""

    def __init__(self, parent=None):
        super(CustomItemWidget, self).__init__(parent=parent)
        self.main_layout = QtGui.QVBoxLayout(self)
        self.label = QtGui.QLabel(self)
        self.main_layout.addWidget(self.label)

    def paintEvent(self, event):
        """Fill the widget rect with a borderless rounded rectangle."""
        canvas = QtGui.QPainter(self)
        canvas.setRenderHint(QtGui.QPainter.Antialiasing)
        # Brush and pen are created before the painter state is saved.
        fill = QtGui.QBrush(QtGui.QColor("#8C8C8C"))
        outline = QtCore.Qt.NoPen
        canvas.save()
        canvas.setPen(outline)
        canvas.setBrush(fill)
        canvas.drawRoundedRect(self.rect(), 12, 12)
        canvas.restore()

    def setData(self, role, value):
        """Show *value* in the label; roles other than DisplayRole are ignored."""
        if role != QtCore.Qt.DisplayRole:
            return
        self.label.setText(value)
class CustomItem(QtGui.QStandardItem):
    """Model item that owns a ``CustomItemWidget`` and a display number."""

    def __init__(self):
        super(CustomItem, self).__init__()
        self.number = None
        self.item_widget = CustomItemWidget()
        self.setSelectable(True)

    def type(self):
        """Return the custom item type id."""
        return _USER_ROLE

    def data(self, role):
        """Return the item data for *role*.

        DisplayRole also pushes the text into the cached widget; SizeHintRole
        returns the default item size; other roles go to the base class.
        """
        if role == QtCore.Qt.SizeHintRole:
            return _DEFAULT_ITEM_SIZE
        if role != QtCore.Qt.DisplayRole:
            return QtGui.QStandardItem.data(self, role)
        text = "DATA %s" % str(self.number)
        self.item_widget.setData(role, text)
        return text
class CustomItemDelegate(QtGui.QStyledItemDelegate):
    """Styled item delegate subclass that adds no behavior of its own."""

    def __init__(self, parent=None):
        super(CustomItemDelegate, self).__init__(parent=parent)
class CustomItemModel(QtGui.QStandardItemModel):
    """Standard item model that reports the same flags for every index."""

    def __init__(self, parent=None):
        super(CustomItemModel, self).__init__(parent)

    def flags(self, index):
        """Return enabled, selectable, drag-enabled and drop-enabled."""
        return (
            QtCore.Qt.ItemIsEnabled
            | QtCore.Qt.ItemIsSelectable
            | QtCore.Qt.ItemIsDragEnabled
            | QtCore.Qt.ItemIsDropEnabled
        )
class CustomItemFilterProxyModel(QtGui.QSortFilterProxyModel):
    """Proxy model set up for case-insensitive filtering on column 0."""

    def __init__(self, parent=None):
        super(CustomItemFilterProxyModel, self).__init__(parent)
        # Dynamic sort/filter is switched on.
        self.setDynamicSortFilter(True)
        self.setFilterCaseSensitivity(QtCore.Qt.CaseInsensitive)
        self.setFilterKeyColumn(0)
class CustomView(QtGui.QListView):
    """Icon-mode list view that shows one widget per model item.

    The view owns the source model and a filter proxy on top of it. Keys 1,
    2 and 3 filter on that digit, Backspace clears the filter, and Plus
    appends a new item.
    """

    def __init__(self, parent=None):
        super(CustomView, self).__init__(parent=parent)
        self.setIconSize(_DEFAULT_ITEM_SIZE)
        self.setMovement(QtGui.QListView.Static)
        self.setSelectionMode(QtGui.QAbstractItemView.ExtendedSelection)
        self.setSelectionBehavior(QtGui.QAbstractItemView.SelectItems)
        self.setViewMode(QtGui.QListView.IconMode)
        self.setUniformItemSizes(True)
        self.setFlow(QtGui.QListView.LeftToRight)
        self.setResizeMode(QtGui.QListView.Adjust)
        self.data_model = CustomItemModel(self)
        self.proxy_model = CustomItemFilterProxyModel(self)
        self.proxy_model.setSourceModel(self.data_model)
        self.setModel(self.proxy_model)

    def keyPressEvent(self, event):
        """Apply the filter shortcuts, then pass the event to the base class."""
        key = event.key()
        wildcard_keys = (
            (QtCore.Qt.Key_1, "*1*"),
            (QtCore.Qt.Key_2, "*2*"),
            (QtCore.Qt.Key_3, "*3*"),
        )
        for qt_key, pattern in wildcard_keys:
            if key == qt_key:
                self.proxy_model.setFilterWildcard(pattern)
                # print() with one argument behaves the same on Python 2 and
                # 3; the print statement form is a SyntaxError on Python 3.
                print(self.proxy_model.filterRegExp())
        if key == QtCore.Qt.Key_Backspace:
            self.proxy_model.setFilterFixedString("")
            print(self.proxy_model.filterRegExp())
        if key == QtCore.Qt.Key_Plus:
            self.addNewItem()
        QtGui.QListView.keyPressEvent(self, event)

    def addNewItem(self):
        """Append an item numbered with the current source row count."""
        item = CustomItem()
        item.number = self.data_model.rowCount()
        self.addItem(item)

    def addItem(self, item):
        """Append *item* to the source model and attach its widget to the view."""
        self.data_model.appendRow(item)
        proxy_index = self.proxy_model.mapFromSource(item.index())
        # NOTE(review): the view presumably takes ownership of the widget and
        # deletes it when the row is filtered out, which would leave
        # item.item_widget dangling -- confirm against the Qt documentation.
        self.setIndexWidget(proxy_index, item.item_widget)
if __name__ == '__main__':
    import sys

    # Demo: a dialog holding a CustomView pre-filled with ten items.
    qapplication = QtGui.QApplication(sys.argv)
    layout = QtGui.QVBoxLayout()
    window = QtGui.QDialog()
    window.setLayout(layout)

    view = CustomView(window)
    view.resize(800, 600)
    layout.addWidget(view)

    for i in range(10):
        item = CustomItem()
        item.number = i
        view.addItem(item)

    window.show()
    sys.exit(qapplication.exec_())
or sample code here:
https://gist.github.com/66e29df303d1f1825a53
Can someone please help me on this? is this a known bug ? or I'm doing it completely wrong :P
Thanks in advance for your help.
This is an old question, but as I struggled with a similar problem for quite a while, here the solution I found and a possible explanation:
Instead of caching the custom widget on the model item, I cached the data needed to create the widget. In my case, I wanted to use a custom label with html in order to be able to format parts of text in different colour. Hence, I cached the html string on the item.
Then, in the initStyleOption method of the item delegate, I recreated the widget if it didn't yet exist or had disappeared after filtering:
label = self.parent().indexWidget(modelIndex)
if not label:
label = CustomLabel(item.html)
self.parent().setIndexWidget(modelIndex, label)
The reason why filtering deletes the widget cached on the item is as follows, I believe: the widget can "exist" only in one place. When it is put as indexWidget, it "exists" on a row in the view, not in an item of the model any more. As filtering removes rows from view, widgets on those rows get deleted. - A poor explanation, but I've often got similar surprises when manipulating html elements with JavaScript if I've forgotten to clone the element.