AWS Glue PySpark: Filter & Manual Mapping of Several Columns

I'm using AWS Glue Studio with DynamicFrameCollections. I created a custom transformation where I am looking to filter by multiple columns and modify two columns in each row based on a static mapping list. I'm struggling to figure out the most efficient way to do this: pandas, UDFs, or something completely different?
Consider the sample dataframe:
data = [{"Category": 'A', "Subcategory": 2, "Value": 121.44, "Properties": {}},
{"Category": 'B', "Subcategory": 2, "Value": 300.01, "Properties": None},
{"Category": 'C', "Subcategory": 3, "Value": 10.99, "Properties": { "Active":True } },
{"Category": 'E', "Subcategory": 4, "Value": 33.87, "Properties": { "Active":True, "ReadOnly": False }},
{"Category": 'E', "Subcategory": 1, "Value": 11.37, "Properties": { "Active":True }}
]
df = spark.createDataFrame(data)
I need to filter and transform by Category and Subcategory. Below is the sample mapping: the key is the category and subcategory concatenated, the first value in each array must become a new column ActivityName, and the second value must be merged into the Properties column:
mapping= {"A2": ["EatingFood", { "Visible": True }],
"A3": ["DrinkingWater", { "Visible": False }],
"B2": ["Sleep", { "Visible": False }],
"C3": ["Exercise", { "Visible": False }],
"E4": ["Running", { "Visible": False }],
}
The output data I am expecting is:
resultingData = [
    {"Category": 'A', "Subcategory": 2, "ActivityName": "EatingFood", "Value": 121.44, "Properties": {"Visible": True}},
    {"Category": 'B', "Subcategory": 2, "ActivityName": "Sleep", "Value": 300.01, "Properties": {"Visible": False}},
    {"Category": 'C', "Subcategory": 3, "ActivityName": "Exercise", "Value": 10.99, "Properties": {"Active": True, "Visible": False}},
    {"Category": 'E', "Subcategory": 4, "ActivityName": "Running", "Value": 33.87, "Properties": {"Active": True, "ReadOnly": False, "Visible": False}},
]
Note that the last data entry, E1, is missing because it was not in my mapping filter.
Is there any way to achieve this? I have a large list of items that I need to manually filter/map/transform like this. Thank you.

I got this working by transforming the DynamicFrame into a DataFrame and processing it with Spark functions inside the Glue custom transform. Here's what I did:
def FilterAndMap(glueContext, dfc) -> DynamicFrameCollection:
    from pyspark.sql.types import StringType, ArrayType
    from awsglue.dynamicframe import DynamicFrame
    import pyspark.sql.functions as f
    import json

    # Static mapping; the extra properties are stored as JSON strings so the
    # UDFs below can merge them with the row's Properties.
    mapping = {
        "A2": ["EatingFood", json.dumps({"Visible": True})],
        "A3": ["DrinkingWater", json.dumps({"Visible": False})],
        "B2": ["Sleep", json.dumps({"Visible": False})],
        "C3": ["Exercise", json.dumps({"Visible": False})],
        "E4": ["Running", json.dumps({"Visible": False})],
    }

    df = dfc.select(list(dfc.keys())[0]).toDF()

    # Look up the [ActivityName, extra-properties JSON] pair for a concatenated key.
    def func_filter_udf(concat_str):
        return mapping[concat_str]

    # Merge the mapping's extra properties with the row's existing Properties;
    # fall back to the mapping's JSON alone when Properties is empty.
    def func_map_udf(map_str):
        if map_str[1]:
            map_string = json.loads(map_str[0])
            ret_val = json.dumps({**map_string, **json.loads(map_str[1])})
        else:
            ret_val = map_str[0]
        return ret_val

    filter_udf = f.udf(func_filter_udf, ArrayType(StringType()))
    map_udf = f.udf(func_map_udf, StringType())

    # Drop rows whose Category+Subcategory key is not in the mapping (e.g. E1).
    df = df.filter(f.concat("Category", "Subcategory").isin([*mapping]))
    df = df.withColumn("concat_col", filter_udf(f.concat("Category", "Subcategory")))
    df = (df.withColumn("ActivityName", df.concat_col[0])
            .withColumn("Properties", map_udf(f.struct(df.concat_col[1], df.Properties))))
    df = df.drop("concat_col")

    dyf_processed = DynamicFrame.fromDF(df, glueContext, "filtered")
    return DynamicFrameCollection({"filtered": dyf_processed}, glueContext)
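
As an aside, if UDF overhead becomes a concern with a large mapping, the same filter and merge can be done without UDFs by joining against a small lookup DataFrame built from the mapping. A minimal sketch, assuming the mapping from the question (plain dicts rather than JSON strings), a MapType Properties column, and Spark 2.4+ for map_concat; the lookup and key column names here are illustrative, not part of the original code:

import pyspark.sql.functions as f

# Build a small lookup DataFrame from the static mapping dict.
lookup = spark.createDataFrame(
    [(k, v[0], v[1]) for k, v in mapping.items()],
    ["key", "ActivityName", "extra_props"],
)

result = (
    df.withColumn("key", f.concat("Category", f.col("Subcategory").cast("string")))
      # The inner join doubles as the filter: unmapped keys (e.g. E1) drop out.
      .join(f.broadcast(lookup), on="key", how="inner")
      .withColumn(
          "Properties",
          # Merge the two maps; guard against a NULL Properties on the left.
          f.when(f.col("Properties").isNull(), f.col("extra_props"))
           .otherwise(f.map_concat("Properties", "extra_props")),
      )
      .drop("key", "extra_props")
)

Note that on Spark 3.x map_concat raises on duplicate keys by default; set spark.sql.mapKeyDedupPolicy to LAST_WIN if a key can appear in both maps.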


plotly mapbox - create clusters in mapview

I am building a Dash app that uses the plotly scattermapbox graph object. In the current map view each point is represented as a circle. As a user zooms in and out, I'd like to cluster the points into groupings. Here's my code for reference.
import dash
from dash import dcc, html, Output, Input
import pandas as pd

app = dash.Dash(__name__)

df = pd.DataFrame({
    'x': [1, 2, 3],
    'Lat': [37.774322, 37.777035, 37.773033],
    'Long': [-122.489761, -122.485555, -122.491220]
})

layout = html.Div([
    dcc.Graph(id="map"),
    dcc.Input(id="inp")
])
@app.callback(
    Output('map', 'figure'),
    Input('inp', 'value')
)
def fin(val):
    # do something
    data = []
    data.append({
        "type": "scattermapbox",
        "lat": df["Lat"],
        "lon": df["Long"],
        "name": "Location",
        "showlegend": False,
        "hoverinfo": "text",
        "mode": "markers",
        "clickmode": "event+select",
        "customdata": df.loc[:, cd_cols].values,
        "marker": {
            "symbol": "circle",
            "size": 8,
            "opacity": 0.7,
            "color": "black"
        }
    })
    layout = {
        "autosize": True,
        "hovermode": "closest",
        "mapbox": {
            "accesstoken": MAPBOX_KEY,
            "bearing": 0,
            "center": {
                "lat": xxx,
                "lon": xxx
            },
            "pitch": 0,
            "zoom": zoom,
            "style": "satellite-streets",
        },
    }
    return {'data': data, 'layout': layout}
Try using plotly.graph_objects.scattermapbox.Cluster. Hope this helps:
from dash import dcc, html, Dash, Output, Input
import pandas as pd
import plotly.graph_objects as go

app = Dash(__name__)

df = pd.DataFrame({
    'x': [1, 2, 3],
    'Lat': [37.774322, 37.777035, 37.773033],
    'Long': [-122.489761, -122.485555, -122.491220]
})

@app.callback(
    Output('map', 'figure'),
    Input('inp', 'value')
)
def fin(val):
    data = []
    data.append({
        "type": "scattermapbox",
        "lat": df["Lat"],
        "lon": df["Long"],
        "name": "Location",
        "showlegend": False,
        "hoverinfo": "text",
        "mode": "markers",
        "clickmode": "event+select",
        "customdata": df.loc[:, ['Lat', 'Long']].values,
        "marker": {
            "symbol": "circle",
            "size": 8,
            "opacity": 0.7,
            "color": "black"
        },
        # Enable native point clustering; markers merge below this zoom level.
        "cluster": {'maxzoom': 14}
    })
    layout = {
        "autosize": True,
        "hovermode": "closest",
        "mapbox": {
            "bearing": 0,
            "center": {
                "lat": 37.774322,
                "lon": -122.489761
            },
            "pitch": 0,
            "zoom": 7,
            "style": "open-street-map",
        },
    }
    return {'data': data, 'layout': layout}

app.layout = html.Div([
    dcc.Graph(id="map"),
    dcc.Input(id="inp")
])

if __name__ == '__main__':
    app.run_server(debug=True)
Notice the cluster parameter I added to data.
P.S. make sure you are using a recent version of Dash for this to work. I used the latest version, dash 2.7.1.
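
As a side note, the same clustering option is exposed through the typed graph-objects API, which can be easier to discover in an IDE. A minimal standalone sketch (not wired into the Dash callback), assuming plotly 5.11+ where Scattermapbox gained the cluster property:

import pandas as pd
import plotly.graph_objects as go

df = pd.DataFrame({
    'Lat': [37.774322, 37.777035, 37.773033],
    'Long': [-122.489761, -122.485555, -122.491220]
})

fig = go.Figure(go.Scattermapbox(
    lat=df["Lat"], lon=df["Long"],
    mode="markers",
    marker=dict(size=8, color="black", opacity=0.7),
    # Same behavior as the dict version above: points merge below this zoom.
    cluster=dict(enabled=True, maxzoom=14),
))
fig.update_layout(mapbox=dict(
    style="open-street-map", zoom=7,
    center=dict(lat=37.774322, lon=-122.489761),
))
fig.show()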

Create JSON with class objects

I almost have what I want, but the method that converts to a JSON object does not get me the rest of the way. I want the same output, except that "add" and "firsts" will hold more content, so I need them to be arrays of objects.
My code:
case class FirstIdentity(docType: String, docNumber: String, pId: String)
case class SecondIdentity(firm: String, code: String, orgType: String,
                          orgNumber: String, typee: String, perms: Seq[String])
case class General(id: Int, pName: String, description: String, add: Seq[SecondIdentity],
                   delete: Seq[String], act: String, firsts: Seq[FirstIdentity])

val someDF = Seq(
  ("0010XR_TYPE_6", "0010XR", "222222", "6", "TYPE", "77444478", "6", 123, 1, "PF 1", "name", "description",
    Seq("PERM1", "PERM2"))
).toDF("firm", "code", "org_number", "org_type", "type", "doc_number",
  "doc_type", "id", "p_id", "p_name", "name", "description", "perms")
someDF.createOrReplaceTempView("vw_test")
val filter = spark.sql("""
  select
    firm, code, org_number, org_type, type, doc_number,
    doc_type, id, p_id, p_name, name, description, perms
  from vw_test
""")
val group =
  filter.rdd.map(x => {
    (
      x.getInt(x.fieldIndex("id")),
      x.getString(x.fieldIndex("p_name")),
      x.getString(x.fieldIndex("description")),
      SecondIdentity(
        x.getString(x.fieldIndex("firm")),
        x.getString(x.fieldIndex("code")),
        x.getString(x.fieldIndex("org_type")),
        x.getString(x.fieldIndex("org_number")),
        x.getString(x.fieldIndex("type")),
        x.getSeq(x.fieldIndex("perms"))
      ),
      "act",
      FirstIdentity(
        x.getString(x.fieldIndex("doc_number")),
        x.getString(x.fieldIndex("doc_type")),
        x.getInt(x.fieldIndex("p_id")).toString
      )
    )
  })
  .toDF("id", "name", "desc", "add", "actKey", "firsts")
  .groupBy("id", "name", "desc", "add", "actKey", "firsts")
  .agg(collect_list("add").as("null"))
  .drop("null")

group.toJSON.show(false)
Result:
{
  "id": 123,
  "name": "PF 1",
  "desc": "description",
  "add": {
    "firm": "0010XR_TYPE_6",
    "code": "0010XR",
    "orgType": "6",
    "orgNumber": "222222",
    "typee": "TYPE",
    "perms": [
      "PERM1",
      "PERM2"
    ]
  },
  "actKey": "act",
  "firsts": {
    "docType": "77444478",
    "docNumber": "6",
    "pId": "1"
  }
}
I want "add" and also "firsts" to be arrays of objects, like this:
EDIT
{
  "id": 123,
  "name": "PF 1",
  "desc": "description",
  "add": [ <----
    {
      "firm": "0010XR_TYPE_6",
      "code": "0010XR",
      "orgType": "6",
      "orgNumber": "222222",
      "typee": "TYPE",
      "perms": [
        "PERM1",
        "PERM2"
      ]
    },
    {
      "firm": "0010XR_TYPE_6",
      "code": "0010XR",
      "orgType": "5",
      "orgNumber": "11111",
      "typee": "TYPE2",
      "perms": [
        "PERM1",
        "PERM2"
      ]
    }
  ],
  "actKey": "act",
  "firsts": [ <----
    {
      "docType": "77444478",
      "docNumber": "6",
      "pId": "1"
    },
    {
      "docType": "411133",
      "docNumber": "6",
      "pId": "2"
    }
  ]
}
As per your comment, you want to aggregate add depending on some grouping. Check which columns you want to group by: the columns you want to aggregate cannot be part of the grouping. That will never work and will always give you separate records.
This will work as per your expectations (I suppose):
val group =
  filter.rdd.map(x => {
    (
      x.getInt(x.fieldIndex("id")),
      x.getString(x.fieldIndex("p_name")),
      x.getString(x.fieldIndex("description")),
      SecondIdentity(
        x.getString(x.fieldIndex("firm")),
        x.getString(x.fieldIndex("code")),
        x.getString(x.fieldIndex("org_type")),
        x.getString(x.fieldIndex("org_number")),
        x.getString(x.fieldIndex("type")),
        x.getSeq(x.fieldIndex("perms"))
      ),
      "act",
      FirstIdentity(
        x.getString(x.fieldIndex("doc_number")),
        x.getString(x.fieldIndex("doc_type")),
        x.getInt(x.fieldIndex("p_id")).toString
      )
    )
  })
  .toDF("id", "name", "desc", "add", "actKey", "firsts")
  .groupBy("id", "name", "desc", "actKey")
  .agg(collect_list("add").as("null"))
  .drop("null")
Result:
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{"id":123,"name":"PF 1","desc":"description","actKey":"act","collect_list(add)":[{"firm":"0010XR_TYPE_6","code":"0010XR","orgType":"6","orgNumber":"222222","typee":"TYPE","perms":["PERM1","PERM2"]},{"firm":"0010XR_TYPE_5","code":"0010XR","orgType":"5","orgNumber":"222223","typee":"TYPE","perms":["PERM1","PERM2"]}]}|
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
Inside your map function you are not wrapping FirstIdentity and SecondIdentity in a Seq, hence add is not getting converted to an array.
Change your map function to this:
filter.rdd.map(x => {
  (
    x.getInt(x.fieldIndex("id")),
    x.getString(x.fieldIndex("p_name")),
    x.getString(x.fieldIndex("description")),
    Seq(SecondIdentity(
      x.getString(x.fieldIndex("firm")),
      x.getString(x.fieldIndex("code")),
      x.getString(x.fieldIndex("org_type")),
      x.getString(x.fieldIndex("org_number")),
      x.getString(x.fieldIndex("type")),
      x.getSeq(x.fieldIndex("perms"))
    )),
    "act",
    Seq(FirstIdentity(
      x.getString(x.fieldIndex("doc_number")),
      x.getString(x.fieldIndex("doc_type")),
      x.getInt(x.fieldIndex("p_id")).toString
    ))
  )
})
This will result in:
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{"id":123,"name":"PF 1","desc":"description","add":[{"firm":"0010XR_TYPE_6","code":"0010XR","orgType":"6","orgNumber":"222222","typee":"TYPE","perms":["PERM1","PERM2"]}],"actKey":"act","firsts":[{"docType":"77444478","docNumber":"6","pId":"1"}]}|
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

Flutter: order JSON data by value

I have data like:
[
  {name: Imamat, short_name: Lev, chapters: 27, name_en: Leviticus, type: old, order: 3},
  {name: Kejadian, short_name: Gen, chapters: 50, name_en: Genesis, type: old, order: 1},
  //....
]
I need to return this data sorted by the order value: 1, 2, 3, 4, .... Here is the function that loads the data above:
readDataBase() async {
  String data = await DefaultAssetBundle.of(context).loadString("assets/db/tb.json");
  final jsonResult = jsonDecode(data);
  return jsonResult['book'];
}
This should do the trick efficiently:
List items = [
  {"name": "Imamat", "short_name": "Lev", "chapters": 27, "name_en": "Leviticus", "type": "old", "order": 3},
  {"name": "Kejadian", "short_name": "Gen", "chapters": 50, "name_en": "Genesis", "type": "old", "order": 1},
];

getOrderNo(e) => e["order"];
items.sort((a, b) => getOrderNo(a).compareTo(getOrderNo(b)));
print(items);

How to filter features by status

I want to show features in my map in a cluster layer, filtered by whether each one is opened or not. How can I do it? Should I create two layers, one with filter: ["has", "opened"] and the other with filter: ["!", ["has", "opened"]]?
export const clusterLayerOpened = {
  id: "clusters",
  type: "circle",
  source: "earthquakes",
  filter: ["has", "opened"],
  paint: {
    "circle-color": ["step", ["get", "opened"], "#51bbd6", 100, "#f1f075", 750, "#f28cb1"],
    "circle-radius": ["step", ["get", "opened"], 20, 100, 30, 750, 40],
  },
};

export const clusterLayerNoOpened = {
  id: "clusters",
  type: "circle",
  source: "earthquakes",
  filter: ["!", ["has", "opened"]],
  paint: {
    "circle-color": ["step", ["get", "opened"], "#51bbd6", 100, "#f1f075", 750, "#f28cb1"],
    "circle-radius": ["step", ["get", "opened"], 20, 100, 30, 750, 40],
  },
};
This is my geojson:
{
  "type": "FeatureCollection",
  "features": [{
    "type": "Feature",
    "properties": {
      "id": "ak16994521",
      "mag": 2.3,
      "time": 1507425650893,
      "felt": null,
      "tsunami": 0,
      "opened": true
    },
    "geometry": {
      "type": "Point",
      "coordinates": [-151.5129, 63.1016, 0.0]
    }
  },
  {
    "type": "Feature",
    "properties": {
      "id": "ak16994519",
      "mag": 1.7,
      "time": 1507425289659,
      "felt": null,
      "tsunami": 0,
      "opened": false
    },
    "geometry": {
      "type": "Point",
      "coordinates": [-150.4048, 63.1224, 105.5]
    }
  }]
}
It's not necessary to create two separate layers to filter by whether a point has been opened or not. Here is some code showing how to add a layer which displays all points with the property "opened": true and hides all points with "opened": false:
map.addLayer({
  'id': 'opened',
  'type': 'circle',
  'source': 'points',
  'paint': {
    'circle-radius': 10,
    // Fully opaque when "opened" is true, invisible otherwise.
    'circle-opacity': ["match", ["to-string", ["get", "opened"]], 'true', 1, 'false', 0, 0]
  }
});
To instead show all points with the property "opened": false, you can switch the 'circle-opacity' expression to read:
["match", ["to-string", ["get", "opened"]], 'true', 0 , 'false', 1, 0]
This code makes use of a few Mapbox expressions. I've linked the documentation to each relevant expression here: match, to-string, and get.
Here is a JSFiddle where two layers are added to the map: https://jsfiddle.net/hpkzrm4n/. The points with "opened": true are shown in red and the points with "opened": false are shown in blue. Note that you will need to add your own Mapbox access token where indicated in order to view the results.

Can't get Service Alerts protobuf to include header_text or description_text using Python gtfs_realtime_pb2 module

We are having difficulty adding header_text and description_text to a Service Alerts protobuf file. We are attempting to match the example shown on this page:
https://developers.google.com/transit/gtfs-realtime/examples/alerts
Our data starts in the following dictionary:
alerts_dict = {
    "header": {
        "gtfs_realtime_version": "1",
        "timestamp": "1543318671",
        "incrementality": "FULL_DATASET"
    },
    "entity": [{
        "497": {
            "active_period": [{
                "start": 1525320000,
                "end": 1546315200
            }],
            "url": "http://www.capmetro.org/planner",
            "effect": 4,
            "header_text": "South 183: Airport",
            "informed_entity": [{
                "route_type": "3",
                "route_id": "17",
                "trip": "",
                "stop_id": "3304"
            }, {
                "route_type": "3",
                "route_id": "350",
                "trip": "",
                "stop_id": "3304"
            }],
            "description_text": "Stop closed temporarily",
            "cause": 2
        },
        "460": {
            "active_period": [{
                "start": 1519876800,
                "end": 1546315200
            }],
            "url": "http://www.capmetro.org/planner",
            "effect": 4,
            "header_text": "Ave F / Duval Detour",
            "informed_entity": [{
                "route_type": "3",
                "route_id": "7",
                "trip": "",
                "stop_id": "1167"
            }, {
                "route_type": "3",
                "route_id": "7",
                "trip": "",
                "stop_id": "1268"
            }],
            "description_text": "Stop closed temporarily",
            "cause": 2
        }
    }]
}
Our Python code is as follows:
newfeed = gtfs_realtime_pb2.FeedMessage()
newfeedheader = newfeed.header
newfeedheader.gtfs_realtime_version = '2.0'

for alert_id, alert_dict in alerts_dict["entity"][0].iteritems():
    print(alert_id)
    print(alert_dict)
    newentity = newfeed.entity.add()
    newalert = newentity.alert
    newentity.id = str(alert_id)
    newtimerange = newalert.active_period.add()
    newtimerange.end = alert_dict['active_period'][0]['end']
    newtimerange.start = alert_dict['active_period'][0]['start']
    for informed in alert_dict['informed_entity']:
        newentityselector = newalert.informed_entity.add()
        newentityselector.route_id = informed['route_id']
        newentityselector.route_type = int(informed['route_type'])
        newentityselector.stop_id = informed['stop_id']
    print(alert_dict['description_text'])
    newdescription = newalert.header_text
    newdescription = alert_dict['description_text']
    newalert.cause = alert_dict['cause']
    newalert.effect = alert_dict['effect']

pb_feed = newfeed.SerializeToString()
with open("servicealerts.pb", 'wb') as fout:
    fout.write(pb_feed)
The frustrating part is that we don't receive any sort of error message. Everything appears to run properly, but the resulting .pb file doesn't contain the new header_text or description_text items.
We are able to read the pb file using the following code:
feed = gtfs_realtime_pb2.FeedMessage()
response = open("servicealerts.pb", 'rb')
feed.ParseFromString(response.read())
print(feed)
We truly appreciate any help that anyone can offer in pointing us in the right direction of figuring this out.
I was able to find the answer. This Python Notebook showed that, by properly formatting the dictionary, the .pb file could be generated with a few lines of code.
from google.transit import gtfs_realtime_pb2
from google.protobuf.json_format import ParseDict

newfeed = gtfs_realtime_pb2.FeedMessage()
ParseDict(alerts_dict, newfeed)

pb_feed = newfeed.SerializeToString()
with open("servicealerts.pb", 'wb') as fout:
    fout.write(pb_feed)
All I had to do was format my dictionary properly:
if ALERT_GROUP_ID not in entity_dict.keys():
    entity_dict[ALERT_GROUP_ID] = {
        "id": ALERT_GROUP_ID,
        "alert": {
            "active_period": [{
                "start": int(START_TIME),
                "end": int(END_TIME)
            }],
            "cause": cause_dict.get(CAUSE, ""),
            "effect": effect_dict.get(EFFECT),
            "url": {
                "translation": [{
                    "text": URL,
                    "language": "en"
                }]
            },
            "header_text": {
                "translation": [{
                    "text": HEADER_TEXT,
                    "language": "en"
                }]
            },
            "informed_entity": [{
                'route_id': ROUTE_ID,
                'route_type': ROUTE_TYPE,
                'trip': TRIP,
                'stop_id': STOP_ID
            }],
            "description_text": {
                "translation": [{
                    "text": "Stop closed temporarily",
                    "language": "en"
                }]
            },
        },
    }
    # print(entity_dict[ALERT_GROUP_ID]["alert"]['informed_entity'])
else:
    entity_dict[ALERT_GROUP_ID]["alert"]['informed_entity'].append({
        'route_id': ROUTE_ID,
        'route_type': ROUTE_TYPE,
        'trip': TRIP,
        'stop_id': STOP_ID
    })
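
For completeness, the likely reason the original loop wrote nothing without raising an error: header_text and description_text are TranslatedString messages, not plain strings, so newdescription = alert_dict['description_text'] merely rebinds a local Python variable and never touches the protobuf. A minimal sketch of setting them directly on the message instead (field names follow the GTFS-realtime spec; the plain-string values from the original dictionary are assumed):

# Inside the loop over alerts, instead of rebinding a local name:
translation = newalert.header_text.translation.add()
translation.text = alert_dict['header_text']          # e.g. "South 183: Airport"
translation.language = 'en'

translation = newalert.description_text.translation.add()
translation.text = alert_dict['description_text']     # e.g. "Stop closed temporarily"
translation.language = 'en'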