We are using OrientDB 1.0.1 and I need to process a class/table of 34 million records. On my machine with 8 GB of memory, I get "Exception in thread "main" java.lang.OutOfMemoryError: GC overhead limit exceeded" after about 1 million records, with 96% of physical memory occupied. Increasing the JVM heap size to 8 GB got me to about 2 million records before it failed again. On a physical machine with 16 GB the problem occurred after about 3.8 million records. I need to process all 34 million records and find the unique ids. The problem clearly seems to be OrientDB's buffers filling up.
int skipRecordCount = 0;
String queryStr = "select id from Table1 WHERE id is not null SKIP " + Integer.toString(skipRecordCount) + " LIMIT 10000";
Set<String> uniqueIdsSet = new TreeSet<String>();
List<ODocument> idsResult = odb.db.query(new OSQLSynchQuery<ODocument>(queryStr));
while (!idsResult.isEmpty())
{
    for (ODocument id : idsResult)
    {
        uniqueIdsSet.add(id.field("id").toString());
    }
    skipRecordCount += 10000;
    queryStr = "select id from Table1 WHERE id is not null SKIP " + Integer.toString(skipRecordCount) + " LIMIT 10000";
    idsResult = odb.db.query(new OSQLSynchQuery<ODocument>(queryStr));
}
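The loop above re-executes a non-indexed query with an ever-growing SKIP, so OrientDB has to walk the skipped records again on every pass. As a point of comparison (this is not the approach the test case below takes), a lazy scan over the class avoids both the repeated scanning and the need to hold result pages in memory. A minimal sketch, assuming the same open ODatabaseDocumentTx (odb.db) and the class/field names from the question:

// Sketch: stream the class with OrientDB's record iterator instead of paging
// with SKIP/LIMIT. Class name "Table1" and field "id" are taken from the question.
Set<String> uniqueIdsSet = new TreeSet<String>();
for (ODocument doc : odb.db.browseClass("Table1")) {
    Object id = doc.field("id");
    if (id != null) {
        uniqueIdsSet.add(id.toString());
    }
}
System.out.println("Found " + uniqueIdsSet.size() + " unique ids");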
I've created a test case for OrientDB 2.2.20.
I added a NOTUNIQUE hash index that ignores null values and successfully ran the whole test with 2 million records (random ids) in 3 minutes, using an in-memory testdb on a 16 GB system (OS X), with a 12 GB max heap (not needed!); the process size was 4.9 GB.
When I changed the insert phase of the test case to 34 million random ids, it completed in 33 minutes with a process size of 7.2 GB (3 GB direct). The index was then created in 15 minutes with a process size of 8.2 GB (4 GB direct), and the test case finished quickly, adding the unique ids to the TreeSet.
I used
"CREATE INDEX test.id NOTUNIQUE_HASH_INDEX METADATA {ignoreNullValues : true}"
and
"SELECT key FROM index:test.id WHERE key NOT IN [NULL] SKIP "+Integer.toString(skipRecordCount)+" LIMIT 10000"
import com.orientechnologies.orient.core.db.document.ODatabaseDocumentTx;
import com.orientechnologies.orient.core.metadata.schema.OClass;
import com.orientechnologies.orient.core.metadata.schema.OSchema;
import com.orientechnologies.orient.core.metadata.schema.OType;
import com.orientechnologies.orient.core.record.impl.ODocument;
import com.orientechnologies.orient.core.sql.OCommandSQL;
import com.orientechnologies.orient.core.sql.query.OSQLSynchQuery;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.Random;
public class SelectUniqueIDs {
    public static void createTreeSet(ODatabaseDocumentTx db) {
        int skipRecordCount = 0;
        String queryStr = "select id from test WHERE id is not null SKIP " + Integer.toString(skipRecordCount) + " LIMIT 10000";
        Set<String> uniqueIdsSet = new TreeSet<String>();
        List<ODocument> idsResult = db.query(new OSQLSynchQuery<ODocument>(queryStr));
        while (!idsResult.isEmpty())
        {
            for (ODocument doc : idsResult)
            {
                // The first page comes from the class query (field "id"),
                // the following pages from the index query (field "key").
                Object id = doc.field("id") != null ? doc.field("id") : doc.field("key");
                uniqueIdsSet.add(id.toString());
            }
            skipRecordCount += 10000;
            // queryStr = "SELECT id FROM test WHERE id IS NOT NULL SKIP "+Integer.toString(skipRecordCount)+" LIMIT 10000";
            // With the query above, the index on test.id is not used, as the following message shows, starting from skip 50000:
            // INFO: $ANSI{green {db=test}} [TIP] Query 'SELECT id FROM test WHERE id IS NOT NULL SKIP 50000 LIMIT 10000' fetched more than 50000 records: to speed up the execution, create an index or change the query to use an existent index
            queryStr = "SELECT key FROM index:test.id WHERE key NOT IN [NULL] SKIP " + Integer.toString(skipRecordCount) + " LIMIT 10000";
            idsResult = db.query(new OSQLSynchQuery<ODocument>(queryStr));
        }
        System.out.println("**** Inserted " + uniqueIdsSet.size() + " ids in uniqueIdsSet TreeSet ****");
    }
    public static final void main(String[] args) {
        int i;
        long maxMemory = Runtime.getRuntime().maxMemory();
        long totalMemory = Runtime.getRuntime().totalMemory();
        long freeMemory = Runtime.getRuntime().freeMemory();
        System.out.println("**** Initial Heap maxMemory="+maxMemory+" , totalMemory="+totalMemory+" , freeMemory="+freeMemory+" ****");
        final ODatabaseDocumentTx db = new ODatabaseDocumentTx("memory:testdb").create();
        final OSchema schema = db.getMetadata().getSchema();
        final OClass clazz = schema.createClass("test");
        clazz.createProperty("id", OType.DOUBLE);
        for (i = 0; i < 34000000; i++) {
            int r = (int) (Math.random() * 100000 + 1);
            db.command(new OCommandSQL("INSERT INTO test(id) VALUES (" + r + ")")).execute();
        }
        db.command(new OCommandSQL("CREATE INDEX test.id NOTUNIQUE_HASH_INDEX METADATA {ignoreNullValues : true}")).execute();
        maxMemory = Runtime.getRuntime().maxMemory();
        totalMemory = Runtime.getRuntime().totalMemory();
        long insertMemory = Runtime.getRuntime().freeMemory();
        System.out.println("**** Inserted "+i+" ids; Heap maxMemory="+maxMemory+" , totalMemory="+totalMemory+" , freeMemory="+insertMemory+" ****");
        createTreeSet(db);
        final List<ODocument> count = db.query(new OSQLSynchQuery<ODocument>("SELECT count(*) as ids FROM test"));
        Long ids = (Long) count.get(0).field("ids");
        maxMemory = Runtime.getRuntime().maxMemory();
        totalMemory = Runtime.getRuntime().totalMemory();
        long countMemory = Runtime.getRuntime().freeMemory();
        System.out.println("**** Counted "+ids+" ids; Heap maxMemory="+maxMemory+" , totalMemory="+totalMemory+" , freeMemory="+countMemory+" ****");
        final List<ODocument> docs = db.query(new OSQLSynchQuery<ODocument>("SELECT FROM test LIMIT 100"));
        for (i = 0; i < 10; i++) {
            Double value = (Double) docs.get(i).field("id");
            System.out.print(i + "=" + value + " ");
        }
        System.out.println();
        maxMemory = Runtime.getRuntime().maxMemory();
        totalMemory = Runtime.getRuntime().totalMemory();
        long selectMemory = Runtime.getRuntime().freeMemory();
        System.out.println("**** Selected "+i+" ids; Heap maxMemory="+maxMemory+" , totalMemory="+totalMemory+" , freeMemory="+selectMemory+" ****");
    }
}
Related
I use an H2 database file in my Java app. The database is relatively small, 5 MB, on an SSD. The biggest table has about 25000 rows (25 columns). A simple select query on this table takes around 1.3 seconds, which seems very slow. The table has a primary key on ID. Here is the test code:
long tic = System.nanoTime();
final String sqlCmd = "select * from Transactions order by ID";
//final String sqlCmd = "select ID from TRANSACTIONS order by ID";
try (Statement statement = mConnection.createStatement();
ResultSet resultSet = statement.executeQuery(sqlCmd)) {
} catch (SQLException e) {
mLogger.error("SQLException " + e.getSQLState(), e);
}
long toc = System.nanoTime();
System.err.println("transactionTimingTest: " + (toc-tic)/1e6);
Repeating the same select query runs reasonably faster, about 0.2 seconds. Is there any way to improve the timing of the first run of the select query on the table?
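The question's own measurement (the repeated run takes about 0.2 s) suggests most of the 1.3 s goes into cold caches on first access. A minimal, hedged sketch of paying that cost at application startup instead of on the first user-visible query, reusing mConnection, mLogger and the Transactions table from the code above (whether a roughly one-second warm-up at startup is acceptable depends on the app):

// Sketch: run the expensive query once right after opening the connection so the
// first user-visible query hits warm caches. mConnection, mLogger and the
// Transactions table are the names from the question; the result set is read once
// and discarded.
private void warmTransactionsTable() {
    final String sqlCmd = "select * from Transactions order by ID";
    try (Statement statement = mConnection.createStatement();
         ResultSet resultSet = statement.executeQuery(sqlCmd)) {
        while (resultSet.next()) {
            resultSet.getLong("ID"); // touch a column so rows are actually fetched
        }
    } catch (SQLException e) {
        mLogger.error("warm-up failed " + e.getSQLState(), e);
    }
}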
We are benchmarking some queries to see if they will still work reliably for "a lot of" data. (1 million isn't that much to be honest, but Postgres already fails here, so it evidently is.)
Our Java code to call these queries looks something like this:
@PersistenceContext
private EntityManager em;
@Resource
private UserTransaction utx;
for (int i = 0; i < 20; i++) {
    this.utx.begin();
    for (int inserts = 0; inserts < 50_000; inserts++) {
        em.createNativeQuery(SQL_INSERT).executeUpdate();
    }
    this.utx.commit();
    for (int parameter = 0; parameter < 25; parameter++) {
        long time = System.currentTimeMillis();
        Assert.assertNotNull(this.em.createNativeQuery(SQL_SELECT).getResultList());
        System.out.println(i + " iterations \t" + parameter + "\t" + (System.currentTimeMillis() - time) + "ms");
    }
}
Or with plain JDBC:
Connection connection = //...
for (int i = 0; i < 20; i++) {
    for (int inserts = 0; inserts < 50_000; inserts++) {
        try (Statement statement = connection.createStatement()) {
            statement.execute(SQL_INSERT);
        }
    }
    for (int parameter = 0; parameter < 25; parameter++) {
        long time = System.currentTimeMillis();
        try (Statement statement = connection.createStatement()) {
            statement.execute(SQL_SELECT);
        }
        System.out.println(i + " iterations \t" + parameter + "\t" + (System.currentTimeMillis() - time) + "ms");
    }
}
The queries we tried were a simple INSERT into a table with JSON and an INSERT over two tables with about 25 rows of values. The SELECT has one or two JOINs and is pretty simple. One set of queries is below (I had to anonymize the SQL, otherwise I wouldn't have been allowed to post it):
CREATE TABLE ts1.p (
id integer NOT NULL,
CONSTRAINT p_pkey PRIMARY KEY ("id")
);
CREATE TABLE ts1.m(
pId integer NOT NULL,
mId character varying(100) NOT NULL,
a1 character varying(50),
a2 character varying(50),
CONSTRAINT m_pkey PRIMARY KEY (pId, mId)
);
CREATE SEQUENCE ts1.seq_p;
/*
* SQL_INSERT
*/
WITH p AS (
INSERT INTO ts1.p (id)
VALUES (nextval('ts1.seq_p'))
RETURNING id AS pId
)
INSERT INTO ts1.m(pId, mId, a1, a2)
VALUES ((SELECT pId from p), 'M1', '11', '12'),
((SELECT pId from p), 'M2', '13', '14'),
/* ... about 20 to 25 rows of values */
/*
* SQL_SELECT
*/
WITH userInput (mId, a1, a2) AS (
VALUES
('M1', '11', '11'),
('M2', '12', '15'),
/* ... about "parameter" rows of values */
)
SELECT m.pId, COUNT(m.a1) AS matches
FROM userInput u
LEFT JOIN ts1.m m ON (m.mId) = (u.mId)
WHERE (m.a1 IS NOT DISTINCT FROM u.a1) AND
(m.a2 IS NOT DISTINCT FROM u.a2) OR
(m.a1 IS NULL AND m.a2 IS NULL)
GROUP BY m.pId
/* plus HAVING, additional WHERE clauses etc. according to the use case, but that just speeds up the query */
When executing, we get the following output (the values are supposed to rise steadily and linearly):
271ms
414ms
602ms
820ms
995ms
1192ms
1396ms
1594ms
1808ms
1959ms
110ms
33ms
14ms
10ms
11ms
10ms
21ms
8ms
13ms
10ms
As you can see, after some value (usually at around 300,000 to 500,000 inserts) the time needed for the query drops significantly. Sadly we can't really debug what the result is at that point (other than that it's not null), but we assume it's an empty list, because the database tables are empty.
Let me repeat that: After half a million INSERTS, Postgres clears tables.
Of course that's not acceptable at all.
We tried different queries, all of easy to medium difficulty, and all produced this behavior, so we assume it's not the queries.
We thought that maybe the sequence returned a value too high for a column of type integer, so we dropped and recreated the sequence.
Once there was this exception:
org.postgresql.util.PSQLException : FEHLER: Verklemmung (Deadlock) entdeckt
Detail: Prozess 1620 wartet auf AccessExclusiveLock-Sperre auf Relation 2001098 der Datenbank 1937678; blockiert von Prozess 2480.
Which translates roughly to:
org.postgresql.util.PSQLException : ERROR: deadlock detected
Detail: Process 1620 is waiting for AccessExclusiveLock on relation 2001098 of database 1937678; blocked by process 2480.
But I don't think this error has anything to do with the clearing of the table. We just tested against the wrong database, so multiple queries were run on the same table. Normally we have one database per benchmark test.
Of course it's important that we find out what the error is, so that we can decide if there is any risk to our customers losing their data (because again, on error the database empties some table of its choice).
Postgres version: PostgreSQL 10.6, compiled by Visual C++ build 1800, 64-bit
We tried PostgreSQL 9.6.11, compiled by Visual C++ build 1800, 64-bit, too. And we never had the same problem there (even though that could just be luck, since it's not 100% reproducible).
Do you have any idea what the error is? Or how we could debug it? The entire benchmark test runs for an hour, so there is no immediate feedback.
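One way to get that feedback within minutes instead of waiting for the full hour, sketched against the plain-JDBC variant above: count the rows right after each timed round and abort as soon as the count shrinks. The table name ts1.m and the loop variable i come from the question; previousCount is a long you would keep across iterations (an assumption about how you track it):

// Sketch: run this right after the inner "parameter" loop in the plain-JDBC version,
// so a sudden drop in row count is visible immediately.
// ts1.m is the table from the question; previousCount is a long kept across iterations.
try (Statement statement = connection.createStatement();
     ResultSet rs = statement.executeQuery("SELECT count(*) FROM ts1.m")) {
    rs.next();
    long currentCount = rs.getLong(1);
    System.out.println(i + " iterations: ts1.m contains " + currentCount + " rows");
    if (currentCount < previousCount) {
        throw new IllegalStateException(
            "ts1.m shrank from " + previousCount + " to " + currentCount + " rows");
    }
    previousCount = currentCount;
}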
I need to search among a billion records in MySQL and it is a very long process (it does work now). Could Sphinx help me? How do I configure Sphinx correctly to search numbers? Should I use an integer attribute for searching (not a string field)?
I need to get only the row whose timestamp is nearest to (or equal to) the queried value:
CREATE TABLE test ( date TIMESTAMP(6) UNIQUE, num INT(32) );
| 2018-07-02 05:50:33.084011 | 282 |
| 2018-07-02 05:50:33.084028 | 475 |
...
(There are 40 M such rows. All timestamps are unique, so this column is a unique index, and I suppose I do not need to create an additional index.)
sphinx.conf:
source src1
{
type = mysql
...
sql_query = SELECT * FROM test
}
indexer...
Sphinx 3.0.3
...
indexing index 'test'...
collected 40000000 docs, 0.0 MB
In my test I look for the timestamp nearest to the queried value:
$start = microtime(true);
$search = '2018-07-02 05:50:33.084011';
$connMySQL = new PDO('mysql:host=localhost;dbname=test','','');
$sql = "SELECT * FROM test WHERE date <= '$search' ORDER BY date DESC LIMIT 1";
$que = $connMySQL->query($sql);
$result = $que->fetchAll(PDO::FETCH_ASSOC);
$query = $connMySQL->query('reset query cache');
$connMySQL = null;
print_r ($result);
echo 'Time MySQL:'.(microtime(true) - $start).' sec.';
$start = microtime(true);
$search = '2018-07-02 05:50:33.084029';
$connSphinxQL = new PDO('mysql:host=localhost;port=9306;dbname=test','root','');
$sql = "SELECT * FROM test WHERE date <= '$search' ORDER BY date DESC LIMIT 1";
$que = $connSphinxQL->query($sql);
$result = $que->fetchAll(PDO::FETCH_ASSOC);
$query = $connSphinxQL->query('reset query cache');
$connSphinxQL = null;
print_r ($result);
echo 'Time Sphinx:'.(microtime(true) - $start).' sec.';
Output:
[date] => 2018-07-02 05:50:33.084011 [num] => 282 Time MySQL: 0.00193 sec.
[date] => 2018-07-02 05:50:33.084028 [num] => 475 Time Sphinx: 0.00184 sec.
I expected to see somewhat different results, but I noticed that I got the same result even before indexing, so I think Sphinx is searching directly in MySQL because my configuration is wrong.
The only related question I found here was about having no text to search.
Should I use integer attribute for searching (not string field)?
Yes. But there is an added complication: an index NEEDS at least one field (Sphinx isn't really designed as a general database; it's intended for text queries!).
You can synthesize a fake one.
sql_query = SELECT unix_timestamp(`date`) AS id, 'a' AS field, num FROM test
sql_attr_uint = num
This also shows that you need a unique integer as the first column, to act as the document id; since your timestamp is unique, you can use that. A UNIX_TIMESTAMP is a nice easy way to represent a timestamp as a plain integer.
You can use id in your queries too, for filtering, so you would need to convert the query timestamp to the same representation at query time.
$query = '2018-07-02 05:50:33.084011';
$id = strtotime($query);
$sql = "SELECT * FROM test WHERE id <= '$id' ORDER BY id DESC LIMIT 1";
I am currently working on weather monitoring.
For example a record of temperature has a date and a location (coordinates).
All of the coordinates are already in the database, what I need to add is time and the value of the temperature. Values and metadata are in a CSV file.
Basically what I'm doing is:
Get time through the file's name
Insert time into DB, and keep the primary key
Reading file, get the value and coordinates
Select query to get the id of the coordinates
Insert weather value with foreign keys (time and coordinates)
The issue is that the
"SELECT id FROM location WHERE latitude = ... AND longitude = ..."
is too slow. I have 230k files, and currently one file takes more than 2 minutes to process... Edit: after changing the index it now takes 25 seconds per file, which is still too slow. Moreover, the PreparedStatement is also slower than a plain Statement and I cannot figure out why.
private static void putFileIntoDB(String variableName, ArrayList<String[]> matrix, File file, PreparedStatement prepWeather, PreparedStatement prepLoc, PreparedStatement prepTime, Connection conn) {
    try {
        int col = matrix.size();
        int row = matrix.get(0).length;
        String ts = getTimestamp(file);
        Time time = getTime(ts);
        // INSERT INTO takes 14ms
        prepTime.setInt(1, time.year);
        prepTime.setInt(2, time.month);
        prepTime.setInt(3, time.day);
        prepTime.setInt(4, time.hour);
        ResultSet rs = prepTime.executeQuery();
        rs.next();
        int id_time = rs.getInt(1);
        // for each column (longitude)
        for (int i = 1; i < col; ++i) {
            // for each row (latitude)
            for (int j = 1; j < row; ++j) {
                try {
                    String lon = matrix.get(i)[0];
                    String lat = matrix.get(0)[j];
                    String var = matrix.get(i)[j];
                    lat = lat.substring(1, lat.length() - 1);
                    lon = lon.substring(1, lon.length() - 1);
                    double latitude = Double.parseDouble(lat);
                    double longitude = Double.parseDouble(lon);
                    double value = Double.parseDouble(var);
                    // With this prepared statement, the lookup needs 16ms to execute
                    prepLoc.setDouble(1, latitude);
                    prepLoc.setDouble(2, longitude);
                    ResultSet rsLoc = prepLoc.executeQuery();
                    rsLoc.next();
                    int id_loc = rsLoc.getInt(1);
                    // Whereas this equivalent block takes 1ms:
                    // Statement stm = conn.createStatement();
                    // ResultSet rsLoc = stm.executeQuery("SELECT id from location WHERE latitude = " + latitude + " AND longitude =" + longitude + ";");
                    // rsLoc.next();
                    // int id_loc = rsLoc.getInt(1);
                    // INSERT INTO takes 1ms
                    prepWeather.setObject(1, id_time);
                    prepWeather.setObject(2, id_loc);
                    prepWeather.setObject(3, value);
                    prepWeather.execute();
                } catch (SQLException ex) {
                    Logger.getLogger(ECMWFHelper.class.getName()).log(Level.SEVERE, null, ex);
                }
            }
        }
    } catch (SQLException ex) {
        Logger.getLogger(ECMWFHelper.class.getName()).log(Level.SEVERE, null, ex);
    }
}
What I already did:
Set two B-tree indexes on table location, on columns latitude and longitude
Dropped the foreign key constraints
PreparedStatements in parameters are :
// Prepare selection for weather_radar foreign key
PreparedStatement prepLoc = conn.prepareStatement("SELECT id from location WHERE latitude = ? AND longitude = ?;");
PreparedStatement prepTime = conn.prepareStatement("INSERT INTO time(dataSetID, year, month, day, hour) " +
"VALUES(" + dataSetID +", ?, ? , ?, ?)" +
" RETURNING id;");
// PrepareStatement for weather_radar table
PreparedStatement prepWeather = conn.prepareStatement("INSERT INTO weather_radar(dataSetID, id_1, id_2, " + variableName + ")"
+ "VALUES(" + dataSetID + ", ?, ?, ?)");
Any idea how to make things go quicker?
Ubuntu 16.04 LTS 64-bits
15.5 GiB RAM
Intel® Core™ i7-6500U CPU @ 2.50GHz × 4
PostgreSQL 9.5.11 on x86_64-pc-linux-gnu, compiled by gcc (Ubuntu 5.4.0-6ubuntu1~16.04.4) 5.4.0 20160609, 64-bit
Netbeans IDE 8.2
JDK 1.8
postgresql-42.2.0.jar
The key issue you have here is that you are missing ResultSet.close() and Statement.close() calls.
Once you resolve that (add the relevant close calls), you may find that a SINGLE conn.prepareStatement call (before both for loops) improves performance even further; of course, you then no longer need to close the statement inside the loop, but you still need to close the result sets inside the loop.
Then you might apply batched SQL.
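A minimal sketch of those two suggestions combined, applied to the inner loop of putFileIntoDB from the question: the location lookup's ResultSet is closed via try-with-resources, and the weather inserts are collected into a JDBC batch instead of being executed one by one. prepLoc, prepWeather, latitude, longitude, value and id_time are the names from the question; the batch size of 1000 is an arbitrary assumption.

// before the two for loops:
final int BATCH_SIZE = 1000;  // arbitrary assumption
int pending = 0;

// inside the inner loop, replacing the lookup + insert:
prepLoc.setDouble(1, latitude);
prepLoc.setDouble(2, longitude);
try (ResultSet rsLoc = prepLoc.executeQuery()) {
    rsLoc.next();
    int id_loc = rsLoc.getInt(1);
    prepWeather.setObject(1, id_time);
    prepWeather.setObject(2, id_loc);
    prepWeather.setObject(3, value);
    prepWeather.addBatch();
}
if (++pending >= BATCH_SIZE) {
    prepWeather.executeBatch();   // one round trip for 1000 inserts
    pending = 0;
}

// after both loops: flush whatever is left
if (pending > 0) {
    prepWeather.executeBatch();
}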
Using EXPLAIN, you can figure out at which point the query becomes slow.
One situation where I have encountered a similar case:
Compound queries, e.g. parameterized, similar date ranges from different tables that are then joined on some indexed value. Even though the dates served as an index, the query produced by the PreparedStatement could not hit the indexes and ended up doing a scan over the joined data.
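To check whether the lookup produced by the PreparedStatement actually hits the index, one option is to run it once through EXPLAIN ANALYZE from the same Java code and print the plan; the literal coordinates below are hypothetical placeholders, and conn is the Connection from the question:

// Sketch: print the plan Postgres actually chooses for the location lookup.
try (Statement stm = conn.createStatement();
     ResultSet plan = stm.executeQuery(
         "EXPLAIN ANALYZE SELECT id FROM location WHERE latitude = 48.85 AND longitude = 2.35")) {
    while (plan.next()) {
        System.out.println(plan.getString(1)); // look for "Index Scan" vs "Seq Scan"
    }
}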
The issue that I have is with the 'partition by' clause in 'match recognize': it seems to support only 99 different partition values, because when I have 100 or more different devices it does not group correctly. To test this I have the following EPL query:
select * from TemperatureSensorEvent
match_recognize (
partition by id
measures A.id as a_id, A.temperature as a_temperature
pattern (A)
define
A as prev(A.id) is null
)
I am using this query basically to get the first event (first temperature) of each device. Testing with 10, 20, 50, ... 99 different devices it works fine, but when I have more than 99, Esper seems to reset all the events sent before the device with id=100, and if I then send an event for the device with id=001, Esper treats it as if it were that device's first event.
It seems that 'partition by' supports only 99 different partitions and that adding one more resets the EPL statement, or something like that. Is this a restriction of the 'partition by' clause? How can I increase this threshold, given that I have more than 100 devices?
ESPER version: 5.1.0
Thanks in advance
Demo Class:
import com.espertech.esper.client.Configuration;
import com.espertech.esper.client.EPAdministrator;
import com.espertech.esper.client.EPRuntime;
import com.espertech.esper.client.EPServiceProvider;
import com.espertech.esper.client.EPServiceProviderManager;
import com.espertech.esper.client.EPStatement;
import java.util.Random;

public class EsperDemo
{
    public static void main(String[] args)
    {
        Configuration config = new Configuration();
        config.addEventType("TemperatureSensorEvent", TemperatureSensorEvent.class.getName());
        EPServiceProvider esperProvider = EPServiceProviderManager.getProvider("EsperDemoEngine", config);
        EPAdministrator administrator = esperProvider.getEPAdministrator();
        EPRuntime esperRuntime = esperProvider.getEPRuntime();
        // query to get the first event of each temperature sensor
        String query = "select * from TemperatureSensorEvent "
                + "match_recognize ( "
                + "  partition by id "
                + "  measures A.id as a_id, A.temperature as a_temperature "
                + "  after match skip to next row "
                + "  pattern (A) "
                + "  define "
                + "    A as prev(A.id) is null "
                + ")";
        TemperatureSubscriber temperatureSubscriber = new TemperatureSubscriber();
        EPStatement cepStatement = administrator.createEPL(query);
        cepStatement.setSubscriber(temperatureSubscriber);
        TemperatureSensorEvent temperature;
        Random random = new Random();
        int sensorsQuantity = 100; // it works fine up to 99 sensors
        for (int i = 1; i <= sensorsQuantity; i++) {
            temperature = new TemperatureSensorEvent(i, random.nextInt(20));
            System.out.println("Sending temperature: " + temperature.toString());
            esperRuntime.sendEvent(temperature);
        }
        temperature = new TemperatureSensorEvent(1, 64);
        System.out.println("Sending temperature: sensor with id=1 again: " + temperature.toString());
        esperRuntime.sendEvent(temperature);
    }
}
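The demo references two classes that are not shown in the question. Minimal sketches that would make it compile (each class in its own file): the property names follow from the EPL above, while the subscriber's update signature, one parameter per measures column, is an assumption about how the subscriber is wired.

// Hypothetical sketch of the event class the demo references.
public class TemperatureSensorEvent {
    private final int id;
    private final int temperature;

    public TemperatureSensorEvent(int id, int temperature) {
        this.id = id;
        this.temperature = temperature;
    }

    // JavaBean getters so Esper can read the "id" and "temperature" properties
    public int getId() { return id; }
    public int getTemperature() { return temperature; }

    @Override
    public String toString() {
        return "TemperatureSensorEvent{id=" + id + ", temperature=" + temperature + "}";
    }
}

// Hypothetical sketch of the subscriber; the update parameters are assumed to
// match the measures columns (a_id, a_temperature) by position.
public class TemperatureSubscriber {
    public void update(Integer a_id, Integer a_temperature) {
        System.out.println("First temperature for sensor " + a_id + ": " + a_temperature);
    }
}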