Twisted (Scrapy) and Postgres - postgresql

I'm using Scrapy (which runs on Twisted) with Postgres as my database.
After a while my connections seem to fill up and the script gets stuck. I checked this with the query SELECT * FROM pg_stat_activity; and read that it happens because Postgres has no built-in connection pooling.
I read about txpostgres and PgBouncer; unfortunately PgBouncer isn't an option for me. What else can I do to avoid this problem?
So far I use the following pipeline:
import psycopg2
from twisted.enterprise import adbapi
import logging
from datetime import datetime

import scrapy
from scrapy.exceptions import DropItem


class PostgreSQLPipeline(object):
    """PostgreSQL pipeline class."""

    def __init__(self, dbpool):
        self.logger = logging.getLogger(__name__)
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbargs = dict(
            host=settings['POSTGRESQL_HOST'],
            database=settings['POSTGRESQL_DATABASE'],
            user=settings['POSTGRESQL_USER'],
            password=settings['POSTGRESQL_PASSWORD'],
        )
        dbpool = adbapi.ConnectionPool('psycopg2', **dbargs)
        return cls(dbpool)

    def process_item(self, item, spider):
        d = self.dbpool.runInteraction(self._insert_item, item, spider)
        d.addErrback(self._handle_error, item, spider)
        d.addBoth(lambda _: item)
        return d

    def _insert_item(self, txn, item, spider):
        """Perform an insert or update."""
        now = datetime.utcnow().replace(microsecond=0).isoformat(' ')
        txn.execute(
            """
            SELECT EXISTS(
                SELECT 1
                FROM expose
                WHERE expose_id = %s
            )
            """, (
                item['expose_id'],
            )
        )
        ret = txn.fetchone()[0]
        if ret:
            self.logger.info("Item already in db: %r" % (item,))
            txn.execute(
                """
                UPDATE expose
                SET last_seen=%s, offline=0
                WHERE expose_id=%s
                """, (
                    now,
                    item['expose_id']
                )
            )
        else:
            self.logger.info("Item stored in db: %r" % (item,))
            txn.execute(
                """
                INSERT INTO expose (
                    expose_id,
                    title
                ) VALUES (%s, %s)
                """, (
                    item['expose_id'],
                    item['title']
                )
            )
        # Write image info (path, original url, ...) to db, constrained to expose.expose_id
        for image in item['images']:
            txn.execute(
                """
                INSERT INTO image (
                    expose_id,
                    name
                ) VALUES (%s, %s)
                """, (
                    item['expose_id'],
                    image['path'].replace('full/', '')
                )
            )

    def _handle_error(self, failure, item, spider):
        """Handle errors that occurred during the db interaction."""
        # do nothing, just log
        self.logger.error(failure, failure.printTraceback())
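If PgBouncer is off the table, one option is to bound Twisted's own pool so the pipeline can never hold more connections than Postgres allows. Below is a minimal sketch of the from_settings call above with the pool size capped; cp_min, cp_max and cp_reconnect are standard twisted.enterprise.adbapi.ConnectionPool arguments, and the limit of 5 is only an example value:

from twisted.enterprise import adbapi

# Sketch: cap the adbapi pool so it cannot exhaust the server's connection slots.
dbpool = adbapi.ConnectionPool(
    'psycopg2',
    host=settings['POSTGRESQL_HOST'],
    database=settings['POSTGRESQL_DATABASE'],
    user=settings['POSTGRESQL_USER'],
    password=settings['POSTGRESQL_PASSWORD'],
    cp_min=1,           # keep one idle connection around
    cp_max=5,           # never open more than five connections in total
    cp_reconnect=True,  # reopen connections the server has dropped
)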

Related

How to interpret an XCOM within a task_group to keep a history of dynamically generated tasks within it?

I implemented a DAG with a task_group that loops over the contents of a file. This isn't ideal; I wish I could loop over the contents of an XCom instead, but unless I'm mistaken, that isn't possible.
import json
import logging

from airflow.decorators import task_group
from airflow.operators.dummy import DummyOperator
from dtm.migration.helpers.tasks import (
    check_compliance_of_objects_sizes,
)
from airflow.providers.google.cloud.transfers.gcs_to_gcs import GCSToGCSOperator
from airflow.providers.google.cloud.sensors.gcs import (
    GCSObjectExistenceSensor,
)


@task_group(group_id="copy_task_group")
def copy_and_verify(
    inp_parameters_path: str,
    impersonated_service_account: str,
):
    try:
        with open(inp_parameters_path, "r") as f:
            inp_parameters = json.load(f)
        for entry in inp_parameters:
            sensor_source_file = GCSObjectExistenceSensor(
                task_id=f"sensor_source_file_{entry}",
                bucket=inp_parameters[entry]["source"]["bucket"],
                object=(
                    f"{inp_parameters[entry]['source']['prefix']}"
                    f"/{inp_parameters[entry]['source']['object']}"
                ),
                impersonation_chain=impersonated_service_account,
            )
            copy_file = GCSToGCSOperator(
                task_id=f"copy_file_{entry}",
                source_bucket=inp_parameters[entry]["source"]["bucket"],
                source_object=(
                    f"{inp_parameters[entry]['source']['prefix']}"
                    f"/{inp_parameters[entry]['source']['object']}"
                ),
                destination_bucket=inp_parameters[entry]["destination"]["bucket"],
                destination_object=(
                    f"{inp_parameters[entry]['destination']['prefix']}"
                    f"/{inp_parameters[entry]['destination']['object']}"
                ),
                impersonation_chain=impersonated_service_account,
            )
            check_size = check_compliance_of_objects_sizes(
                date_of_execution="{{ds_nodash}}",
                data_to_check=inp_parameters[entry],
                impersonation_chain=impersonated_service_account,
            )
            sensor_destination_file = GCSObjectExistenceSensor(
                task_id=f"sensor_destination_file_{entry}",
                bucket=inp_parameters[entry]["destination"]["bucket"],
                object=(
                    f"{inp_parameters[entry]['destination']['prefix']}"
                    f"/{inp_parameters[entry]['destination']['object']}"
                ),
                impersonation_chain=impersonated_service_account,
            )
            end_op = DummyOperator(task_id=f"end_{entry}")
            (
                sensor_source_file
                >> copy_file
                >> check_size
                >> sensor_destination_file
                >> end_op
            )
    except FileNotFoundError:
        logging.info(
            f"File {inp_parameters_path} is generated in a prior task."
        )
As you can see, I used try...except FileNotFoundError: because the error (file not found) is raised at DAG-parse time, since inp_parameters_path hasn't been created yet by the upstream tasks.
The inp_parameters_path file is created upstream, but unfortunately it seems that in order to display the logged steps of the task_group after execution I need to keep this file, which I don't want to do because it changes from one day to the next.
Example of the task_group contents disappearing after execution: the contents were all present before remove_mapping_file ran.
How can I pass an XCom to a task_group the same way I can with a task? If that's not possible, how do I archive the generated inp_parameters_path files per execution so that I can come back later and browse the DAG's runs?
If it helps, here's how my DAG operates:
@dag(
    catchup=False,
    schedule_interval="@daily",
    max_active_runs=1,
    dag_id="migrate_dtm_gcs_data_history",
    start_date=datetime(2022, 12, 1),
    dagrun_timeout=timedelta(minutes=20),
    tags=[
        "migration",
        "dtm",
    ],
    default_args=default_args,
)
def template_dag():
    map = convert_csv_migration_to_map(
        source_csv=f"{COMPOSER_GCS_LOC_PATH}/dags/dtm/migration/helpers/gcs_migration_mapping_archetype.csv",
        delimiter=",",
    )
    updated_map_1 = update_mapping_with_gcp_project_and_buckets_ids(
        src_mig_map=map,
        env="dev",
        impersonation_chain=IMPERSONATED_SERVICE_ACCOUNT,
    )
    list_files_and_prefixes = GCSListObjectsOperator(
        task_id="list_files_and_prefixes",
        bucket=LEGACY_DTM_HISTORY_BUCKET,
        prefix="raw_data/datamart/{{ds_nodash}}",
        impersonation_chain=IMPERSONATED_SERVICE_ACCOUNT,
    )
    updated_map_2 = match_data_with_migration_map(
        src_mig_map=updated_map_1,
        files_and_prefixes=list_files_and_prefixes.output,
    )
    list_of_maps = flatten_to_input_files_granularity(
        src_mig_map=updated_map_2,
    )
    gcs_migration_maps_path = (
        f"{COMPOSER_GCS_LOC_PATH}/dags/dtm/migration"
        f"/helpers/gcs_migration_list_of_maps_{{ds_nodash}}.json"
    )
    write_list_of_maps_into_dict = gen_dict_file_from_list_of_map(
        file_name=gcs_migration_maps_path,
        list_of_maps=list_of_maps,
    )
    copy_task_group = copy_and_verify(
        inp_parameters_path=gcs_migration_maps_path,
        impersonated_service_account=IMPERSONATED_SERVICE_ACCOUNT,
    )
    remove_mapping_file = BashOperator(
        task_id="remove_mapping_file",
        bash_command=f"rm -vf {gcs_migration_maps_path}",
    )
    (
        map
        >> updated_map_1
        >> list_files_and_prefixes
        >> updated_map_2
        >> list_of_maps
        >> write_list_of_maps_into_dict
        >> copy_task_group
        >> remove_mapping_file
    )


dag = template_dag()
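Regarding the XCom question: since Airflow 2.5, dynamic task mapping also works on task groups, so one alternative (sketched below, not tested against this DAG) is to return the list of entries from a @task and expand the @task_group over that XCom instead of writing the gcs_migration_list_of_maps JSON file to disk. The list_migration_entries task below is a hypothetical replacement for gen_dict_file_from_list_of_map, and the group body is only indicated:

from airflow.decorators import task, task_group
from airflow.operators.dummy import DummyOperator

@task
def list_migration_entries():
    # Hypothetical helper: return one dict per file to copy, so the data travels
    # through XCom instead of a JSON file on the scheduler's filesystem.
    return [
        {"name": "entry_a", "source_bucket": "bucket-a", "destination_bucket": "bucket-b"},
    ]

@task_group(group_id="copy_task_group")
def copy_and_verify(entry: dict):
    # Build the per-entry chain here (sensors, GCSToGCSOperator, size check),
    # reading buckets/objects from `entry` rather than from inp_parameters[entry].
    start = DummyOperator(task_id="start")
    end = DummyOperator(task_id="end")
    start >> end

# Inside template_dag(): one group instance is created per list element at run time,
# so nothing has to exist on disk at DAG-parse time.
copy_and_verify.expand(entry=list_migration_entries())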

AWS GlueStudio RDS -> Redshift invalid timestamp format

I am trying to create an AWS Glue ETL job to move data from Aurora RDS to Redshift, but I cannot work out how to get the timestamp fields properly mapped. All stages of the job show a valid preview of the expected data, but the job always fails with the following error:
py4j.protocol.Py4JJavaError: An error occurred while calling o179.pyWriteDynamicFrame.
: java.sql.SQLException:
Error (code 1206) while loading data into Redshift: "Invalid timestamp format or value [YYYY-MM-DD HH24:MI:SS]"
Table name: public.stage_table_ae89e9dffe974b649bbf4852e49a4b12
Column name: updated_at
Column type: timestamp(0)
Raw line: 1234,5341,1121,0,2022-01-06 16:29:55.000000000,2022-01-06 16:29:55.000000000,1,1,Suzy
Raw field value: 0
I have tried using a date format to remove the microseconds and I have tried forcing quotes around the date fields; nothing works.
Here is the generated script
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrameCollection
from awsglue.dynamicframe import DynamicFrame
from awsglue import DynamicFrame


# Script generated for node Custom transform
def CastIntsTransform(glueContext, dfc) -> DynamicFrameCollection:
    df = dfc.select(list(dfc.keys())[0])
    df_resolved = (
        df.resolveChoice(specs=[("id", "cast:bigint")])
        .resolveChoice(specs=[("user_id", "cast:bigint")])
        .resolveChoice(specs=[("connected_user_id", "cast:bigint")])
        .resolveChoice(specs=[("mg_id", "cast:bigint")])
        .resolveChoice(specs=[("access_level", "cast:tinyint")])
        .resolveChoice(specs=[("status", "cast:tinyint")])
    )
    return DynamicFrameCollection({"CustomTransform0": df_resolved}, glueContext)


def sparkSqlQuery(glueContext, query, mapping, transformation_ctx) -> DynamicFrame:
    for alias, frame in mapping.items():
        frame.toDF().createOrReplaceTempView(alias)
    result = spark.sql(query)
    return DynamicFrame.fromDF(result, glueContext, transformation_ctx)


args = getResolvedOptions(sys.argv, ["JOB_NAME"])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)

# Script generated for node JDBC Connection
JDBCConnection_node1 = glueContext.create_dynamic_frame.from_catalog(
    database="ABC123",
    table_name="user_connections",
    transformation_ctx="JDBCConnection_node1",
)

# Script generated for node SQL
SqlQuery0 = """
select
    id,
    user_id,
    connected_user_id,
    COALESCE(mg_id, 0) mg_id,
    created_at,
    updated_at,
    updated_at,
    access_level,
    status,
    COALESCE(nickname, '') nickname
from
    apiData
"""
SQL_node1647619002820 = sparkSqlQuery(
    glueContext,
    query=SqlQuery0,
    mapping={"apiData": JDBCConnection_node1},
    transformation_ctx="SQL_node1647619002820",
)

# Script generated for node Custom transform
Customtransform_node1647612655336 = CastIntsTransform(
    glueContext,
    DynamicFrameCollection(
        {"SQL_node1647619002820": SQL_node1647619002820}, glueContext
    ),
)

# Script generated for node Select From Collection
SelectFromCollection_node1647613332516 = SelectFromCollection.apply(
    dfc=Customtransform_node1647612655336,
    key=list(Customtransform_node1647612655336.keys())[0],
    transformation_ctx="SelectFromCollection_node1647613332516",
)

# Script generated for node ApplyMapping
ApplyMapping_node2 = ApplyMapping.apply(
    frame=SelectFromCollection_node1647613332516,
    mappings=[
        ("id", "bigint", "id", "bigint"),
        ("user_id", "bigint", "user_id", "bigint"),
        ("connected_user_id", "bigint", "connected_user_id", "bigint"),
        ("mg_id", "bigint", "mg_id", "bigint"),
        ("created_at", "timestamp", "created_at", "timestamp"),
        ("updated_at", "timestamp", "updated_at", "timestamp"),
        ("access_level", "tinyint", "access_level", "tinyint"),
        ("status", "tinyint", "status", "tinyint"),
        ("nickname", "varchar", "nickname", "varchar"),
    ],
    transformation_ctx="ApplyMapping_node2",
)

# Script generated for node Amazon Redshift
pre_query = "drop table if exists public.stage_table_cd5d65739d334453938f090ea1cb2d6e;create table public.stage_table_cd5d65739d334453938f090ea1cb2d6e as select * from public.test_user_connections where 1=2;"
post_query = "begin;delete from public.test_user_connections using public.stage_table_cd5d65739d334453938f090ea1cb2d6e where public.stage_table_cd5d65739d334453938f090ea1cb2d6e.id = public.test_user_connections.id; insert into public.test_user_connections select * from public.stage_table_cd5d65739d334453938f090ea1cb2d6e; drop table public.stage_table_cd5d65739d334453938f090ea1cb2d6e; end;"
AmazonRedshift_node1647612972417 = glueContext.write_dynamic_frame.from_jdbc_conf(
    frame=ApplyMapping_node2,
    catalog_connection="ABC123",
    connection_options={
        "database": "test",
        "dbtable": "public.stage_table_cd5d65739d334453938f090ea1cb2d6e",
        "preactions": pre_query,
        "postactions": post_query,
    },
    redshift_tmp_dir=args["TempDir"],
    transformation_ctx="AmazonRedshift_node1647612972417",
)

job.commit()
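Two things may be worth checking here, offered only as a sketch rather than a confirmed fix: the SELECT list above selects updated_at twice while the ApplyMapping node maps each column once, so the column positions sent to Redshift can end up shifted (which could explain a raw value of 0 landing in updated_at), and the timestamps can be trimmed to second precision in the SQL node so the staging load sees plain 'YYYY-MM-DD HH:MI:SS' values. A possible rewrite of the SQL node:

# Sketch of an adjusted SQL node: select updated_at only once so the columns line up
# with the ApplyMapping node, and format the timestamps without fractional seconds.
# Note: date_format() returns strings, so the ApplyMapping source type for these two
# columns would then be "string" instead of "timestamp".
SqlQuery0 = """
select
    id,
    user_id,
    connected_user_id,
    COALESCE(mg_id, 0) mg_id,
    date_format(created_at, 'yyyy-MM-dd HH:mm:ss') created_at,
    date_format(updated_at, 'yyyy-MM-dd HH:mm:ss') updated_at,
    access_level,
    status,
    COALESCE(nickname, '') nickname
from
    apiData
"""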

pg8000.core.ProgrammingError: 'could not determine data type of parameter $2'

I'm using pg8000 (Postgres) and trying to run the following SELECT query:
cursor.execute(
    """
    SELECT orders.name, orders.order_price, orders.selling_price, orders.earnings
    FROM member, orders
    WHERE member.id=orders.member_id
    AND member.name = %s
    """,
    member_username
)
Here member_username is a string, but I am getting the following error:
pg8000.core.ProgrammingError: ('ERROR', 'ERROR', '42P18', 'could not determine data type of parameter $2', 'postgres.c', '1350', 'exec_parse_message', '', '')
However, when I run the same query in a GUI tool, everything works and I get the results. What is the problem?
You passed the parameter incorrectly; you should pass the parameters as a tuple, a list, or a dictionary.
Example with a tuple:
cursor.execute(
    """
    SELECT orders.name, orders.order_price, orders.selling_price, orders.earnings
    FROM member, orders
    WHERE member.id=orders.member_id
    AND member.name = %s
    """,
    (member_username,)
)
Example with a list:
cursor.execute(
    """
    SELECT orders.name, orders.order_price, orders.selling_price, orders.earnings
    FROM member, orders
    WHERE member.id=orders.member_id
    AND member.name = %s
    """,
    [member_username]
)
Example with a dictionary:
cursor.execute(
    """
    SELECT orders.name, orders.order_price, orders.selling_price, orders.earnings
    FROM member, orders
    WHERE member.id=orders.member_id
    AND member.name = %(mname)s
    """,
    {'mname': member_username}
)
http://initd.org/psycopg/docs/usage.html#query-parameters

Python3: IMAP -- cannot refer to tuples as elements

I am working on a script that automatically logs in to a Gmail account and saves the email attachments. I have working code, but I am currently trying to refactor it into a cleaner layout. I am having trouble figuring out how to convert tuples into variables that can be returned and then passed as arguments into subsequent functions. Is this even possible within this type of solution?
The login and filter work, but I get the following error relating to the typ, email_selection item: NameError: name 'email_selection' is not defined
import email, getpass, imaplib, os, sys
from date_range import start_date, end_date

# enter_credentials
user_name = input('Enter your gmail username:\n')
password = getpass.getpass('Enter your password:\n')
imap_key = imaplib.IMAP4_SSL('imap.gmail.com', 993)

# login_to_the_mailbox
def login_to_the_mailbox(imap_key, user_name, password):
    typ, login_attempt = imap_key.login(user_name, password)
    if typ != 'OK':
        print('Not able to login!')
    else:
        print('Login successful!')

# filter_the_mailbox
def filter_the_mailbox(imap_key, start_date, end_date):
    imap_key.select('Inbox')
    typ, email_selection = imap_key.search(None, "(UNSEEN)", "(SENTSINCE {0})".format(start_date), "(SENTBEFORE {0})".format(end_date))
    if typ != 'OK':
        print('Not able to filter mailbox.')
    else:
        print('Mailbox filtered!')
    return email_selection

# fetch_the_mail
def fetch_the_mail(imap_key, email_selection):
    for msg_id in email_selection[0].split():
        typ, message_parts = imap_key.fetch(msg_id, '(RFC822)')
        if typ != 'OK':
            print('Error fetching mail.')
        else:
            print('Mail fetched!')
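The NameError itself is a scoping issue: email_selection only exists inside filter_the_mailbox, so it has to be captured from the function's return value and passed along explicitly. A minimal sketch of the calling code, keeping the functions above unchanged (the variable name selection is arbitrary):

# Capture each return value and hand it to the next function explicitly;
# a local name like email_selection never leaks out of the function that set it.
login_to_the_mailbox(imap_key, user_name, password)
selection = filter_the_mailbox(imap_key, start_date, end_date)
if selection is not None:
    fetch_the_mail(imap_key, selection)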

AutoRollback doesn't rollback

After I run the following spec, the table exists. I expected it never to be present, since it should only exist inside the eventually rolled-back transaction.
import java.util.UUID

import org.specs2.mutable.Specification
import scalikejdbc.{DB, NamedDB}
import scalikejdbc.config.DBs
import scalikejdbc.specs2.mutable.AutoRollback

class MyQuerySpec extends Specification with ArbitraryInput {
  sequential

  DBs.setup('myDB)

  "creating the table" in new AutoRollback {
    override def db(): DB = NamedDB('myDB).toDB()

    private val tableName = s"test_${UUID.randomUUID().toString.replaceAll("-", "_")}"
    private val query = new MyQuery(tableName)
    query.createTable
    ok
  }
}
The DBs.setup('myDB) line is not part of the documented examples, but if I remove it I get the exception java.lang.IllegalStateException: Connection pool is not yet initialized.(name:'myDB)
The source of MyQuery.createTable:
SQL(s"DROP TABLE IF EXISTS $tableName").execute().apply()
SQL(s"""
|CREATE TABLE $tableName (
| id bigint PRIMARY KEY
|)""".stripMargin).execute().apply()
Config:
db {
  myDB {
    driver = "org.postgresql.Driver"
    url = "****"
    user = "****"
    password = "****"
    poolInitialSize = 1
    poolMaxSize = 300
    poolConnectionTimeoutMillis = 120000
    poolValidationQuery = "select 1 as one"
    poolFactoryName = "commons-dbcp2"
  }
}
ScalikeJDBC v2.2.9
MyQuery#createTable must accept an implicit DBSession parameter, like this:
def createTable(implicit session: DBSession)
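A minimal sketch of what that looks like in context; the surrounding class body is assumed here, since only the two SQL statements were shown above. With the session taken implicitly, AutoRollback's transactional session is the one the statements actually run on, so the CREATE TABLE is rolled back together with the rest of the example:

import scalikejdbc._

class MyQuery(tableName: String) {
  // Taking the session implicitly lets AutoRollback supply its transactional session;
  // without this parameter the statements pick up whatever session is in scope
  // (typically an auto-commit one), so the table survives the rollback.
  def createTable(implicit session: DBSession): Unit = {
    SQL(s"DROP TABLE IF EXISTS $tableName").execute().apply()
    SQL(s"""
      |CREATE TABLE $tableName (
      |  id bigint PRIMARY KEY
      |)""".stripMargin).execute().apply()
  }
}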