How do I transfer data from Elasticsearch to Postgres?

I have a huge amount of data in Elasticsearch, and I want to write a script that creates a table corresponding to a particular index and transfers all the data to Postgres.

Never mind, I got my answer. What I did was:
1. create a connection to Elasticsearch and to Postgres
2. create the table in PostgreSQL
3. buffer the data in chunks of 10k records in a dictionary of lists
4. transfer the data from that dictionary into PostgreSQL, then empty it for the next iteration
import psycopg2
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
from collections import defaultdict

chunk = defaultdict(list)  # column name -> list of values for the current 10k-record chunk

# Elasticsearch connection
t_host = "localhost"
t_port = "9200"
t_dbname_ES = "companydatabase"  # index name
t_user = "elastic"
t_pw = "changeme"
client_ES = Elasticsearch([t_host], http_auth=(t_user, t_pw), port=t_port)

# PostgreSQL connection
t_host = "localhost"
t_port = "5999"
t_dbname = "postgres"
t_user = "postgres"
t_pw = "postgres"
db_conn = psycopg2.connect(host=t_host, port=t_port, dbname=t_dbname, user=t_user, password=t_pw)
db_cursor = db_conn.cursor()

column_name_list = ["Address", "Age", "DateOfJoining", "Designation", "FirstName", "Gender", "Interests", "LastName", "MaritalStatus", "Salary"]
column_type_list = ["text not null", "integer", "date", "text", "text", "text", "text", "text", "text", "integer"]
table_name = 'sample_table2'  # table name to insert data into
column_names = ', '.join(column_name_list)

# table creation
create_table_query = "CREATE TABLE {} (".format(table_name)
for i in range(len(column_name_list)):
    create_table_query += column_name_list[i]
    create_table_query += " "
    create_table_query += column_type_list[i]
    if i != len(column_name_list) - 1:
        create_table_query += ", "
create_table_query += ");"
try:
    db_cursor.execute(create_table_query)
    db_conn.commit()
except psycopg2.Error as e:
    t_message = "Database error: " + str(e)

# data insertion
s = Search(index=t_dbname_ES).using(client_ES).query("match_all")
total_documents = s.count()  # total count of records in the index
count = 0
for hit in s.scan():  # looping over all records one at a time
    count += 1
    total_documents -= 1
    for i in range(len(column_name_list)):  # appending the data fetched from the document to the chunk buffer
        chunk[column_name_list[i]].append(hit[column_name_list[i]])
    if count == 10000 or total_documents == 0:  # inserting into Postgres 10k records at a time
        insert_query = "INSERT INTO " + table_name + " (" + column_names + ") VALUES"
        for i in range(min(10000, count)):
            insert_query += "("
            for j in range(len(column_name_list)):
                if j != 0:
                    insert_query += ", " + "'" + str(chunk[column_name_list[j]][i]) + "'"
                else:
                    insert_query += "'" + str(chunk[column_name_list[j]][i]) + "'"
            insert_query += "),"
        insert_query = insert_query[:-1]
        insert_query += ";"
        for i in range(len(column_name_list)):  # emptying the buffer for the next iteration of 10k records
            chunk[column_name_list[i]] = []
        try:
            db_cursor.execute(insert_query)
            db_conn.commit()
            count = 0
        except psycopg2.Error as e:
            t_message = "Database error: " + str(e)
db_cursor.close()
db_conn.close()
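For reference, the same chunked transfer can also be written with a parameterized batch insert, which lets psycopg2 handle quoting instead of building the VALUES string by hand. This is not the author's original method, just a minimal sketch using psycopg2.extras.execute_values, assuming the client_ES, db_conn, table_name and column_name_list defined above (before the connection is closed):

from psycopg2.extras import execute_values

# One %s placeholder: execute_values expands it into the batched VALUES list.
insert_sql = "INSERT INTO {} ({}) VALUES %s".format(table_name, ", ".join(column_name_list))
db_cursor = db_conn.cursor()

rows = []
for hit in Search(index=t_dbname_ES).using(client_ES).query("match_all").scan():
    rows.append(tuple(hit[col] for col in column_name_list))
    if len(rows) == 10000:
        execute_values(db_cursor, insert_sql, rows, page_size=1000)
        db_conn.commit()
        rows = []
if rows:  # flush the final partial chunk
    execute_values(db_cursor, insert_sql, rows, page_size=1000)
    db_conn.commit()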

Related

Recording a list in Postgres

I have a CSV file with the content:
COD_DIST;ID
11;['42341', '42340', '42342']
To import this CSV into Postgres I use the following code:
conn = psycopg2.connect(host='x.x.x.x',
                        database='BASENAME_ANY',
                        user=user,
                        password=password)
curs = conn.cursor()
with open(filename, "r", encoding='utf-8') as fp:
    # Skip the header row.
    next(fp)
    tup = [tuple(line.strip().split(";")) for line in fp]
    print(tup)
    args_str = ",".join(curs.mogrify("(%s,%s)", x).decode('utf-8') for x in tup)
    sqlDec = "INSERT INTO " + name_table + " VALUES "
    curs.execute(sqlDec + args_str)
    conn.commit()
curs.close()
conn.close()
I received this message: DETAIL: "[" must introduce explicitly-specified array dimensions...
Please, can you help me?
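No answer is recorded here, but the DETAIL message suggests the target column is a Postgres array type, which expects literals like {42341,42340,42342}, while the CSV contains a Python-style list ['42341', ...]. A minimal sketch of one way to handle this, assuming the second column really is an array (e.g. text[]); parse_id_list is a hypothetical helper:

import ast

def parse_id_list(raw):
    # "['42341', '42340', '42342']" -> ['42341', '42340', '42342']
    # psycopg2 adapts Python lists to Postgres arrays automatically.
    return ast.literal_eval(raw)

with open(filename, "r", encoding='utf-8') as fp:
    next(fp)  # skip the header row
    tup = [(cod, parse_id_list(ids))
           for cod, ids in (line.strip().split(";", 1) for line in fp)]

args_str = ",".join(curs.mogrify("(%s,%s)", x).decode('utf-8') for x in tup)
curs.execute("INSERT INTO " + name_table + " VALUES " + args_str)
conn.commit()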

Unable to insert data in PostgreSQL 11.0 table

I would like to insert values into a PostgreSQL 11.0 table. However, when I try to do that I get the following error:
TypeError: not all arguments converted during string formatting
I am running the following code:
# CREATE TABLE
try:
    connect_str = "dbname='xx' user='xx' host='xx' " "password='xx' port = xx"
    conn = psycopg2.connect(connect_str)
except:
    print("Unable to connect to the database")
cursor = conn.cursor()
cursor.execute("""DROP TABLE IF EXISTS tbl""")
try:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS tbl(
        entry_id CHARACTER VARYING NOT NULL,
        name CHARACTER VARYING NOT NULL,
        class CHARACTER VARYING NOT NULL,
        ko_id CHARACTER VARYING NOT NULL,
        PRIMARY KEY (entry_id))
        """)
except:
    print("The table cannot be created!")
conn.commit()
conn.close()
cursor.close()

# INSERT DATA INTO TABLE
try:
    connect_str = "dbname='xx' user='xx' host='xx' " "password='xx' port = xx"
    conn = psycopg2.connect(connect_str)
except:
    print("Unable to connect to the database")
cursor = conn.cursor()
with open('file.txt') as f:
    for line in f:
        if re.match('^[A-Z]+', line) and line.startswith("ENTRY") or line.startswith("NAME") or line.startswith("CLASS") or line.startswith("KO_PATHWAY"):
            key, value = line.split(" ", 1)
            # print(key, value)
            if key == "ENTRY":
                cursor.execute("INSERT INTO tbl (entry_id) VALUES (%s)", ('value'))
conn.commit()
conn.close()
cursor.close()
The key-value pairs look like this:
ENTRY map00010 Pathway
NAME Glycolysis / Gluconeogenesis
CLASS Metabolism; Carbohydrate metabolism
KO_PATHWAY ko00010
ENTRY map00011 Pathway
NAME Glycolysis
CLASS Metabolism; Carbohydrate
KO_PATHWAY ko00011
The values "map00010 Pathway" and "map00011 Pathway" should be inserted into the table, creating two rows.
Any help is highly appreciated.
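No answer is recorded here, but the TypeError usually comes from the final execute call: ('value') is just the string 'value', not a one-element tuple, so psycopg2 sees one placeholder but five characters' worth of arguments. A minimal sketch of the likely fix, reusing the question's variable names:

# Pass the parsed variable as a one-element tuple (note the trailing comma).
if key == "ENTRY":
    cursor.execute("INSERT INTO tbl (entry_id) VALUES (%s)", (value.strip(),))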

Null values in qtablewidget

I am developing a desktop application with PyQt5 and PostgreSQL integration. I stumbled over a situation where the table does not display the values:
source code:
def createTable(self):
    self.tableWidget = QTableWidget()
    conn = psycopg2.connect('host=localhost port=5432 dbname=postgres user=postgres password=12345678')
    cursor = conn.cursor()
    query = cursor.execute("SELECT * FROM wave_params")
    result = cursor.fetchall()
    for i in result:
        print(i)
    rows = len(result)
    columns = len(result[0])
    self.tableWidget.setColumnCount(columns)
    self.tableWidget.setRowCount(rows)
    index = 0
    while query != None:
        self.tableWidget.setItem(index, 0, QTableWidgetItem(query.result[0]))
        # self.tableWidget.setItem(index, 1, QTableWidgetItem(str(query.value(1))))
        # self.tableWidget.setItem(index, 2, QTableWidgetItem(str(query.value(2))))
        index = index + 1
    # table selection change
    self.tableWidget.doubleClicked.connect(self.on_click)

@pyqtSlot()
def on_click(self):
    print("\n")
    for currentQTableWidgetItem in self.tableWidget.selectedItems():
        print(currentQTableWidgetItem.row(), currentQTableWidgetItem.column(), currentQTableWidgetItem.text())
I cannot understand what the problem is.
Thanks!
The cursor object has no attribute result.
In your code, result is a list of tuples containing the return value of cursor.fetchall(), and query is the return value of cursor.execute(), which always returns None (see the documentation). You only need to loop over result; here are two examples:
def createTable(self):
    self.tableWidget = QTableWidget(self)
    conn = psycopg2.connect('host=localhost port=5432 dbname=postgres user=postgres password=12345678')
    cursor = conn.cursor()
    cursor.execute("SELECT * FROM ladestelle")
    result = cursor.fetchall()
    rows = len(result)
    columns = len(result[0])
    self.tableWidget.setColumnCount(columns)
    self.tableWidget.setRowCount(rows)

    for i, r in enumerate(result):
        self.tableWidget.setItem(i, 0, QTableWidgetItem(r[0]))
        self.tableWidget.setItem(i, 1, QTableWidgetItem(str(r[1])))
        self.tableWidget.setItem(i, 2, QTableWidgetItem(str(r[2])))
    '''
    # or simpler
    for r in range(rows):
        for c in range(columns):
            self.tableWidget.setItem(r, c, QTableWidgetItem(str(result[r][c])))
    '''
    self.tableWidget.doubleClicked.connect(self.on_click)

SQlAlchemy bulk insert is taking too much time for postgresql connection string

While using the bulk insertion code given in the SQLAlchemy performance FAQ, http://docs.sqlalchemy.org/en/latest/faq/performance.html, SQLite works fine and takes roughly the time described in the document. Using the same code with a PostgreSQL connection string, the total time is many times larger.
Is there any way to make it faster in PostgreSQL? What am I doing wrong here?
Especially bulk_insert_mappings and bulk_save_objects, which are my only options for inserting 370,000 rows.
PostgreSQL connection string:
connection_string = 'postgresql://' + conf.DB_USER + ':' + conf.DB_PASSWORD + '@' + \
                    conf.DB_HOST + ':' + conf.DB_PORT + '/' + conf.DB_NAME
Code used for checking performance:
import time
import sqlite3

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import scoped_session, sessionmaker

Base = declarative_base()
DBSession = scoped_session(sessionmaker())
engine = None


class Customer(Base):
    __tablename__ = "customer"
    id = Column(Integer, primary_key=True)
    name = Column(String(255))


def init_sqlalchemy(dbname='sqlite:///sqlalchemy.db'):
    global engine
    connection_string = 'postgresql://' + 'scott' + ':' + 'tiger' + '@' + \
                        'localhost' + ':' + '5432' + '/' + 'test_db'
    engine = create_engine(connection_string, echo=False)
    DBSession.remove()
    DBSession.configure(bind=engine, autoflush=False, expire_on_commit=False)
    Base.metadata.drop_all(engine)
    Base.metadata.create_all(engine)


def test_sqlalchemy_orm(n=100000):
    init_sqlalchemy()
    t0 = time.time()
    for i in xrange(n):
        customer = Customer()
        customer.name = 'NAME ' + str(i)
        DBSession.add(customer)
        if i % 1000 == 0:
            DBSession.flush()
    DBSession.commit()
    print(
        "SQLAlchemy ORM: Total time for " + str(n) +
        " records " + str(time.time() - t0) + " secs")


def test_sqlalchemy_orm_pk_given(n=100000):
    init_sqlalchemy()
    t0 = time.time()
    for i in xrange(n):
        customer = Customer(id=i+1, name="NAME " + str(i))
        DBSession.add(customer)
        if i % 1000 == 0:
            DBSession.flush()
    DBSession.commit()
    print(
        "SQLAlchemy ORM pk given: Total time for " + str(n) +
        " records " + str(time.time() - t0) + " secs")


def test_sqlalchemy_orm_bulk_save_objects(n=100000):
    init_sqlalchemy()
    t0 = time.time()
    n1 = n
    while n1 > 0:
        n1 = n1 - 10000
        DBSession.bulk_save_objects(
            [
                Customer(name="NAME " + str(i))
                for i in xrange(min(10000, n1))
            ]
        )
    DBSession.commit()
    print(
        "SQLAlchemy ORM bulk_save_objects(): Total time for " + str(n) +
        " records " + str(time.time() - t0) + " secs")


def test_sqlalchemy_orm_bulk_insert(n=100000):
    init_sqlalchemy()
    t0 = time.time()
    n1 = n
    while n1 > 0:
        n1 = n1 - 10000
        DBSession.bulk_insert_mappings(
            Customer,
            [
                dict(name="NAME " + str(i))
                for i in xrange(min(10000, n1))
            ]
        )
    DBSession.commit()
    print(
        "SQLAlchemy ORM bulk_insert_mappings(): Total time for " + str(n) +
        " records " + str(time.time() - t0) + " secs")


def test_sqlalchemy_core(n=100000):
    init_sqlalchemy()
    t0 = time.time()
    engine.execute(
        Customer.__table__.insert(),
        [{"name": 'NAME ' + str(i)} for i in xrange(n)]
    )
    print(
        "SQLAlchemy Core: Total time for " + str(n) +
        " records " + str(time.time() - t0) + " secs")


def init_sqlite3(dbname):
    conn = sqlite3.connect(dbname)
    c = conn.cursor()
    c.execute("DROP TABLE IF EXISTS customer")
    c.execute(
        "CREATE TABLE customer (id INTEGER NOT NULL, "
        "name VARCHAR(255), PRIMARY KEY(id))")
    conn.commit()
    return conn


def test_sqlite3(n=100000, dbname='sqlite3.db'):
    conn = init_sqlite3(dbname)
    c = conn.cursor()
    t0 = time.time()
    for i in xrange(n):
        row = ('NAME ' + str(i),)
        c.execute("INSERT INTO customer (name) VALUES (?)", row)
    conn.commit()
    print(
        "sqlite3: Total time for " + str(n) +
        " records " + str(time.time() - t0) + " sec")


if __name__ == '__main__':
    test_sqlalchemy_orm(100000)
    test_sqlalchemy_orm_pk_given(100000)
    test_sqlalchemy_orm_bulk_save_objects(100000)
    test_sqlalchemy_orm_bulk_insert(100000)
    test_sqlalchemy_core(100000)
    test_sqlite3(100000)
Output:
SQLAlchemy ORM: Total time for 100000 records 40.6781959534 secs
SQLAlchemy ORM pk given: Total time for 100000 records 21.0855250359 secs
SQLAlchemy ORM bulk_save_objects(): Total time for 100000 records 14.068707943 secs
SQLAlchemy ORM bulk_insert_mappings(): Total time for 100000 records 11.6551070213 secs
SQLAlchemy Core: Total time for 100000 records 12.5298728943 secs
sqlite3: Total time for 100000 records 0.477468013763 sec
Using the original connection string (i.e. sqlite):
engine = create_engine(dbname, echo=False)
Output:
SQLAlchemy ORM: Total time for 100000 records 16.9145789146 secs
SQLAlchemy ORM pk given: Total time for 100000 records 10.2713520527 secs
SQLAlchemy ORM bulk_save_objects(): Total time for 100000 records 3.69206118584 secs
SQLAlchemy ORM bulk_insert_mappings(): Total time for 100000 records 1.00701212883 secs
SQLAlchemy Core: Total time for 100000 records 0.467703104019 secs
sqlite3: Total time for 100000 records 0.566409826279 sec
The fastest method is to use COPY FROM (see SQLAlchemy, Psycopg2 and Postgresql COPY) but if you do NOT have write permissions, e.g. deploying to Heroku, then you can leverage Psycopg2 Fast Execution Helpers.
For example, for bulk or core insert, the following:
engine = create_engine(
    "postgresql+psycopg2://scott:tiger@host/dbname",
    executemany_mode='values',
    executemany_values_page_size=10000)
brings the timings to:
SQLAlchemy ORM bulk_save_objects(): Total time for 100000 records 2.796818971633911 secs
SQLAlchemy ORM bulk_insert_mappings(): Total time for 100000 records 1.3805248737335205 secs
SQLAlchemy Core: Total time for 100000 records 1.1153180599212646 secs
Instead of
SQLAlchemy ORM bulk_save_objects(): Total time for 100000 records 9.02771282196045 secs
SQLAlchemy ORM bulk_insert_mappings(): Total time for 100000 records 7.643821716308594 secs
SQLAlchemy Core: Total time for 100000 records 7.460561275482178 secs
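As a rough illustration of the COPY route mentioned above, here is a minimal sketch (not from the original answer) that streams the same benchmark rows through psycopg2's copy_expert using an in-memory CSV buffer; it assumes the engine and init_sqlalchemy from the benchmark script, and timings will vary:

import io
import time

def test_psycopg2_copy(n=100000):
    init_sqlalchemy()
    t0 = time.time()
    # Build an in-memory CSV of the rows to load.
    buf = io.StringIO()
    for i in range(n):
        buf.write("NAME %d\n" % i)
    buf.seek(0)
    # Borrow a raw psycopg2 connection from the SQLAlchemy engine.
    raw_conn = engine.raw_connection()
    try:
        cur = raw_conn.cursor()
        cur.copy_expert("COPY customer (name) FROM STDIN WITH (FORMAT csv)", buf)
        raw_conn.commit()
    finally:
        raw_conn.close()
    print("psycopg2 COPY: Total time for " + str(n) +
          " records " + str(time.time() - t0) + " secs")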

Passing a numeric WHERE parameter condition in Postgres using Python

I am trying to use PostgreSQL from Python. The query filters on a numeric field in the WHERE condition. The result set is not being fetched, and I get the error "psycopg2.ProgrammingError: no results to fetch". There are records in the database with agent_id (an integer field) > 1.
import psycopg2

# Try to connect
try:
    conn = psycopg2.connect("dbname='postgres' host='localhost'")
except:
    print "Error connect to the database."
cur = conn.cursor()

agentid = 10000
try:
    sql = 'SELECT * from agent where agent_id > %s::integer;'
    data = agentid
    cur.execute(sql, data)
except:
    print "Select error"

rows = cur.fetchall()
print "\nRows: \n"
for row in rows:
    print " ", row[9]
Perhaps try these things in your code:
conn=psycopg2.connect("dbname=postgres host=localhost user=user_here password=password_here port=port_num_here")
sql = 'SELECT * from agent where agent_id > %s;'
data = (agentid,) # A single element tuple.
then use
cur.execute(sql,data)
Also, I am confused about what you want to do with this code:
for row in rows:
    print " ", row[9]
Do you want to print each row in rows, or just index 9 of rows, from
rows = cur.fetchall()
If you wanted that index, you could
print rows[9]
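Putting the answer's suggestions together, a hedged sketch of the corrected query section (connection values are placeholders taken from the answer):

import psycopg2

conn = psycopg2.connect("dbname=postgres host=localhost user=user_here password=password_here port=port_num_here")
cur = conn.cursor()

agentid = 10000
sql = 'SELECT * from agent where agent_id > %s;'
cur.execute(sql, (agentid,))  # parameters must be passed as a tuple (or list/dict)

for row in cur.fetchall():
    print(row[9])  # value at index 9 of each returned row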