Related
I'm trying to load the Pokec profiles dataset into a database with ./oetl.sh. However, the pipeline keeps exiting, and I don't know what the problem is. Please help. (The dbURL was set correctly in my real configuration; I have erased it here.) I've tried all the settings I could think of, but none of them work. With the configuration below the data is loaded fine, but the process does not shut down automatically.
{
  "config": {
    "parallel": true
  },
  "source": {
    "file": {
      "path": "/home/yuna/soc-pokec-profiles.txt",
      "lock": true,
      "encoding": "UTF-8"
    }
  },
  "extractor": { "row": {} },
  "transformers": [
    {
      "csv": {
        "columns": ["id","public","completion_percentage","gender","region","region2","last_login","registration","AGE","body","I_am_working_in_field","spoken_languages","hobbies","I_most_enjoy_good_food","pets","body_type","my_eyesight","eye_color","hair_color","hair_type","completed_level_of_education","favourite_color","relation_to_smoking","relation_to_alcohol","on_pokec_i_am_looking_for","love_is_for_me","relation_to_casual_sex","my_partner_should_be","marital_status","children","relation_to_children","I_like_movies","I_like_watching_movie","I_like_music","I_mostly_like_listening_to_music","the_idea_of_good_evening","I_like_specialties_from_kitchen","fun","I_am_going_to_concerts","my_active_sports","my_passive_sports","profession","I_like_books","life_style","music","cars","politics","relationships","art_culture","hobbies_interests","science_technologies","computers_internet","education","sport","movies","travelling","health","companies_brands","more"],
        "separator": "\t",
        "nullValue": "NULL"
      }
    },
    { "vertex": { "class": "Profile" } },
    {
      "field": {
        "fieldName": "id",
        "expression": "id.prefix('P')"
      }
    },
    {
      "field": {
        "fieldNames": ["region2","body","I_am_working_in_field","spoken_languages","hobbies","I_most_enjoy_good_food","pets","body_type","my_eyesight","eye_color","hair_color","hair_type","completed_level_of_education","favourite_color","relation_to_smoking","relation_to_alcohol","on_pokec_i_am_looking_for","love_is_for_me","relation_to_casual_sex","my_partner_should_be","marital_status","children","relation_to_children","I_like_movies","I_like_watching_movie","I_like_music","I_mostly_like_listening_to_music","the_idea_of_good_evening","I_like_specialties_from_kitchen","fun","I_am_going_to_concerts","my_active_sports","my_passive_sports","profession","I_like_books","life_style","music","cars","politics","relationships","art_culture","hobbies_interests","science_technologies","computers_internet","education","sport","movies","travelling","health","companies_brands","more"],
        "operation": "remove"
      }
    }
  ],
  "loader": {
    "orientdb": {
      "dbURL": "",
      "dbType": "graph",
      "wal": false,
      "batchCommit": 10000,
      "useLightweightEdges": true,
      "dbAutoCreateProperties": true,
      "classes": [
        { "name": "Profile", "extends": "V", "clusters": 3 },
        { "name": "Relation", "extends": "E" }
      ],
      "indexes": [
        { "class": "Profile", "fields": ["id:string"], "type": "UNIQUE" }
      ],
      "settings": {}
    }
  }
}
I have some data in 2 CSV files: one contains the vertices and the other contains the edges. I'm working out how to set this up using ETL and am close but not quite there yet -- it mostly works, but my edges have properties and I'm not sure that they're loading right. This question was helpful but I'm still missing something...
Here's my data:
vertices.csv:
label,data,date
v01,0.1234,2015-01-01
v02,0.5678,2015-01-02
v03,0.9012,2015-01-03
edges.csv:
u,v,weight,date
v01,v02,12.4,2015-06-17
v02,v03,17.9,2015-09-14
I import my vertices using this:
commonVertices.json:
{
  "begin": [
    {
      "let": {
        "name": "$filePath",
        "expression": "$fileDirectory.append($fileName)"
      }
    }
  ],
  "config": { "log": "info" },
  "source": { "file": { "path": "$filePath" } },
  "extractor": {
    "csv": {
      "ignoreEmptyLines": true,
      "nullValue": "N/A",
      "dateFormat": "yyyy-MM-dd"
    }
  },
  "transformers": [
    { "vertex": { "class": "myVertex" } },
    {
      "code": {
        "language": "Javascript",
        "code": "print(' Current record: ' + record); record;"
      }
    }
  ],
  "loader": {
    "orientdb": {
      "dbURL": "plocal:my_orientdb",
      "dbType": "graph",
      "batchCommit": 1000,
      "classes": [
        { "name": "myVertex", "extends": "V" }
      ],
      "indexes": []
    }
  }
}
vertices.json:
{
  "config": {
    "log": "info",
    "fileDirectory": "./",
    "fileName": "vertices.csv"
  }
}
commonEdges.json:
{
  "begin": [
    {
      "let": {
        "name": "$filePath",
        "expression": "$fileDirectory.append($fileName )"
      }
    }
  ],
  "config": { "log": "info" },
  "source": { "file": { "path": "$filePath" } },
  "extractor": {
    "csv": {
      "ignoreEmptyLines": true,
      "nullValue": "N/A",
      "dateFormat": "yyyy-MM-dd"
    }
  },
  "transformers": [
    { "merge": { "joinFieldName": "u", "lookup": "myVertex.label" } },
    {
      "edge": {
        "class": "myEdge",
        "joinFieldName": "v",
        "lookup": "myVertex.label",
        "direction": "out",
        "unresolvedLinkAction": "NOTHING"
      }
    },
    { "field": { "fieldNames": ["u", "v"], "operation": "remove" } }
  ],
  "loader": {
    "orientdb": {
      "dbURL": "plocal:my_orientdb",
      "dbType": "graph",
      "batchCommit": 1000,
      "useLightweightEdges": false,
      "classes": [
        { "name": "myEdge", "extends": "E" }
      ],
      "indexes": []
    }
  }
}
edges.json:
{
  "config": {
    "log": "info",
    "fileDirectory": "./",
    "fileName": "edges.csv"
  }
}
I am running it with oetl.sh like this:
$ oetl.sh vertices.json commonVertices.json
$ oetl.sh edges.json commonEdges.json
Everything runs. I'm new to OrientDB, so maybe the properties are being stored on my edges somewhere, but when I query the edges I don't see the weight and date fields:
orientdb {db=my_orientdb}> SELECT FROM myEdge
+----+-----+------+-----+-----+
|# |#RID |#CLASS|out |in |
+----+-----+------+-----+-----+
|0 |#33:0|myEdge|#25:0|#26:0|
|1 |#34:0|myEdge|#26:0|#27:0|
+----+-----+------+-----+-----+
The vertex table contains the [weight] field from my edges.csv, and the [date] field is getting clobbered in a weird way. The day of the month is getting overwritten with the day from the edges.csv file, which is undesirable, but it's odd to me that the month itself isn't also getting changed:
orientdb {db=my_orientdb}> SELECT FROM myVertex
+----+-----+--------+------+-------------------+-----+------+----------+---------+
|# |#RID |#CLASS |data |date |label|weight|out_myEdge|in_myEdge|
+----+-----+--------+------+-------------------+-----+------+----------+---------+
|0 |#25:0|myVertex|0.1234|2015-01-17 00:06:00|v01 |12.4 |[#33:0] | |
|1 |#26:0|myVertex|0.5678|2015-01-14 00:09:00|v02 |17.9 |[#34:0] |[#33:0] |
|2 |#27:0|myVertex|0.9012|2015-01-03 00:01:00|v03 | | |[#34:0] |
+----+-----+--------+------+-------------------+-----+------+----------+---------+
I'm sure it's probably a simple tweak, any help would be great!
In the edge transformer, use edgeFields to bind properties to the edges. Example:
"transformers": [
{ "merge": { "joinFieldName": "u", "lookup": "myVertex.label" } },
{ "edge": { "class": "myEdge",
"joinFieldName": "v",
"lookup": "myVertex.label",
"edgeFields": { "weight": "${input.weight}", "date": "${input.date}" },
"direction": "out",
"unresolvedLinkAction": "NOTHING"
}
},
{ "field": { "fieldNames": ["u", "v"], "operation": "remove" } }
],
Hope it helps.
I am creating a communication graph.
Each message has a msgid and each person has a userid.
I have already created the message vertices; now I want to create the user vertices and an edge connecting each message vertex to its user vertex.
A user can get multiple messages (obviously).
My file contains:
msgid, userid, (and some other info i will assign to the edge)
The issue that I am having is that my file contains duplicate userids (because users can get multiple messages). I don't want to create another vertex for the same userid, so I use skipDuplicates. But if I do skip duplicates, the edge does not get created either. I do want multiple edges to the same user vertex, as each edge represents one message.
How do i keep the User vertex unique but create the edge?
Here is my current ETL .json file, which works fine with the exception of what I have detailed above:
{
  "source": { "file": { "path": "msgs.txt" } },
  "extractor": { "row": {} },
  "transformers": [
    { "csv": { "separator": "\t" } },
    { "vertex": { "class": "user", "skipDuplicates": true } },
    {
      "edge": {
        "class": "sent_to",
        "joinFieldName": "msgid",
        "lookup": "message.id",
        "direction": "in",
        "edgeFields": { "n": "${input.n}" }
      }
    }
  ],
  "loader": {
    "orientdb": {
      "dbURL": "remote:/localhost/databases/communication",
      "dbType": "graph",
      "classes": [
        { "name": "user", "extends": "V" },
        { "name": "message", "extends": "V" },
        { "name": "sent_to", "extends": "E" }
      ],
      "indexes": [
        { "class": "user", "fields": ["id"], "type": "UNIQUE" }
      ]
    }
  }
}
Okay, here is what i did and it seemed to work.
First i created the message vertices (as stated above, in the q.).
Then i created the user vertices.
Then, to create the edges between them, I ran the following ETL on a file that had {userid, msgid, ...}:
{
  "source": {
    "file": { "path": "msgs1.txt" }
  },
  "extractor": {
    "row": {}
  },
  "transformers": [
    { "csv": { "separator": "\t" } },
    { "merge": { "joinFieldName": "userid", "lookup": "user.id" } },
    { "vertex": { "class": "user", "skipDuplicates": true } },
    {
      "edge": {
        "class": "sent_to",
        "joinFieldName": "msgid",
        "lookup": "message.id",
        "direction": "in",
        "edgeFields": {
          "n": "${input.n}",
          "date": "${input.date}"
        }
      }
    }
  ],
  "loader": {
    "orientdb": {
      "dbURL": "remote:/localhost/databases/communication",
      "dbType": "graph",
      "classes": [
        { "name": "user", "extends": "V" },
        { "name": "message", "extends": "V" },
        { "name": "sent_to", "extends": "E" }
      ],
      "indexes": []
    }
  }
}
This created all the edges, even if there was more than one edge pointing to a user.
Hopefully this will help someone
I have a graph in PostgreSQL with 2 tables:
vertex (id:uuid)
edge (from:uuid, to:uuid)
I use ETL to import it into OrientDB. For the vertices:
{
  "config": { "log": "debug" },
  "extractor": {
    "jdbc": {
      "driver": "org.postgresql.Driver",
      "url": "jdbc:postgresql://localhost/test",
      "userName": "postgres",
      "userPassword": "123456",
      "query": "select id from vertex"
    }
  },
  "transformers": [
    { "vertex": { "class": "vertex" } }
  ],
  "loader": {
    "orientdb": {
      "dbURL": "PLOCAL:../databases/test",
      "dbUser": "admin",
      "dbPassword": "admin",
      "dbAutoCreate": true,
      "dbAutoDropIfExists": false
    }
  }
}
When browsing the result in OrientDB, it seemed that the value I see in the id field is not consistent with the result I get when querying PostgreSQL.
Then I ran ETL for the edges:
{
  "config": {
    "parallel": true,
    "log": "debug"
  },
  "extractor": {
    "jdbc": {
      "driver": "org.postgresql.Driver",
      "url": "jdbc:postgresql://localhost/test",
      "userName": "postgres",
      "userPassword": "123456",
      "query": "select \"from\", \"to\" from edge"
    }
  },
  "transformers": [
    { "merge": { "joinFieldName": "from", "lookup": "vertex.id" } },
    { "vertex": { "class": "vertex", "skipDuplicates": true } },
    { "edge": { "class": "has_edge", "joinFieldName": "to", "lookup": "vertex.id", "direction": "in" } },
    { "field": { "fieldNames": ["from", "to"], "operation": "remove" } }
  ],
  "loader": {
    "orientdb": {
      "dbURL": "PLOCAL:../databases/test",
      "dbUser": "admin",
      "dbPassword": "admin",
      "dbAutoCreate": true,
      "dbAutoDropIfExists": false
    }
  }
}
The debug output printed to the console showed that there were some edges for which the ETL processor could not look up the from or to references. These messages use the UUIDs from PostgreSQL. Since those are inconsistent with the values I can query in OrientDB, it is hard to figure out which edges are causing the problem.
My question is: how can I configure my ETL so that UUIDs from PostgreSQL are imported properly into OrientDB?
I have a CSV file with two columns, Id1 and Id2. Id1 and Id2 identify vertices of two different classes. I want to create an edge between Id1 and Id2. Can this be achieved with ETL?
Can we add something to the edge configuration in the transformers to achieve this?
I will assume, that
the two classes are A and B
A has Id1
B has Id2
the class of the edge is AtoB
A and B instances are present in the DB
The AtoB.csv is like
AId,BId
a1,b1
a2,b2
a2,b3
Then the following ETL config will do
{
  "source": { "file": { "path": ".../AtoB.csv" } },
  "extractor": { "csv": {} },
  "transformers": [
    {
      "merge": {
        "joinFieldName": "BId",
        "lookup": "B.Id2",
        "unresolvedLinkAction": "WARNING"
      }
    },
    { "vertex": { "class": "B" } },
    {
      "edge": {
        "class": "AtoB",
        "joinFieldName": "AId",
        "lookup": "A.Id1",
        "direction": "in"
      }
    },
    {
      "field": {
        "fieldNames": ["AId", "BId"],
        "operation": "remove"
      }
    }
  ],
  "loader": {
    "orientdb": {
      "dbURL": "plocal:../databases/...",
      "dbType": "graph",
      "useLightweightEdges": false,
      "classes": [
        { "name": "A", "extends": "V" },
        { "name": "B", "extends": "V" },
        { "name": "AtoB", "extends": "E" }
      ]
    }
  }
}
The result will be
(a1) ➡ (b1)
(a2) ➡ (b2)
(a2) ➡ (b3)