Nearest building with OpenStreetMap

I have a CSV of points with latitude and longitude, and I'm trying to find the nearest building to each point and add that building's data as a column to the CSV (or pandas DataFrame) in Python. I've tried Pyrosm and various other libraries, but I can't seem to prune the data down to the nearest building and then attach its data. Thanks.
This is what I have:
from pyrosm import OSM
from pyrosm import get_data
import geopandas as gpd
from sklearn.neighbors import BallTree
import numpy as np
import osmnx as ox

# get rid of weird error
import shapely
import warnings
from shapely.errors import ShapelyDeprecationWarning
import csv

def get_gig_data(csv_fname):
    with open(csv_fname, "r", encoding="latin-1") as gig_records:
        for gig_record in csv.reader(gig_records):
            yield gig_record

def main():
    warnings.filterwarnings("ignore", category=ShapelyDeprecationWarning)
    chicago_osm = OSM(get_data("chicago"))

    # get a Point of Interest GeoDataFrame (a custom filter could narrow
    # the types, but I think no filter might be best)
    points_of_interest = chicago_osm.get_pois()

    # get building nodes and edges
    nodes, edges = chicago_osm.get_network(nodes=True, network_type="walking")
    buildings = chicago_osm.get_buildings()
    b_cnt = len(buildings)
    G = chicago_osm.to_graph(nodes, edges)
    # nodes = get_igraph_nodes(G)

    buildings['geometry'] = buildings.centroid

    # poi_list = np.asarray([point.coords for point in points_of_interest['geometry']])
    # print(poi_list.shape)
    # tree = BallTree(np.asarray([point.coords for point in points_of_interest['geometry']]), metric="manhattan")
    # Note: the scipy implementation of manhattan/cityblock distance might be
    # faster, because it uses a C function
    # for i in points_of_interest:
    #     print('Type: ', type(i), ' ', i)

    # Read in the gig work data; csv.reader with open() goes line by line
    # and saves a lot of memory
    gig_fp = "data_sample.csv"
    # gig_data = gpd.read_file(gig_fp)
    iter_gig = iter(get_gig_data(gig_fp))
    next(iter_gig)  # skip the header row

    ids = dict()
    for building in buildings.iterrows():
        # print(type(building[1][32]), ' ', building[1][32])
        ids[building[1][32]] = building

    # find the closest node for each trip
    for trip in iter_gig:
        # using a generator, so this should be memory efficient
        # trip[17] is assumed to hold WKT like "POINT (lon lat)"
        tup = tuple(float(x) for x in trip[17][8:-1].split())
        print(type(tup), ' ', tup)
        # nearest_nodes expects X (longitude) and Y (latitude) separately
        src_ids, euclidean_distance = ox.distance.nearest_nodes(
            G, tup[0], tup[1], return_dist=True)
        # THEN ADD THE PICKUP AND DROPOFF IDS TO THIS TUPLE AND ADD TO A NEW NP ARRAY

if __name__ == '__main__':
    main()
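For the nearest-building join itself, here is a minimal sketch using geopandas.sjoin_nearest (available in geopandas 0.10+); the lat/lon column names in data_sample.csv and the choice of projected CRS are assumptions:

import pandas as pd
import geopandas as gpd
from pyrosm import OSM, get_data

osm = OSM(get_data("chicago"))
buildings = osm.get_buildings()

points = pd.read_csv("data_sample.csv")
points_gdf = gpd.GeoDataFrame(
    points,
    geometry=gpd.points_from_xy(points["lon"], points["lat"]),  # assumed column names
    crs="EPSG:4326",
)

# project both layers to a metric CRS so the join distance is in meters
buildings = buildings.to_crs(epsg=26916)    # NAD83 / UTM zone 16N, covers Chicago
points_gdf = points_gdf.to_crs(epsg=26916)

# one row per point, with the nearest building's attributes appended
joined = gpd.sjoin_nearest(points_gdf, buildings, how="left", distance_col="dist_m")
joined.to_csv("data_sample_with_buildings.csv", index=False)

sjoin_nearest appends every building column to each point row in one call, which avoids hand-rolling the BallTree.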

Related

'SequentialFeatureSelector' object has no attribute 'ranking_'

Hi everyone,
I would like to sort the best features that I'm getting from SequentialFeatureSelector based on their ranks, but I get this error:
'SequentialFeatureSelector' object has no attribute 'ranking_'
Can you help me solve it and sort my features based on their importance?
import pandas as pd

df = pd.read_csv('')

# Dividing X and Y
Y = df.iloc[:, -1:]
X = df[df.columns.drop(Y)]

# Selecting the K best features
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor()
sfs = SequentialFeatureSelector(model, n_features_to_select=5)
fit = sfs.fit(X, Y)

# Selecting them and showing them in a dataframe
names = X.columns.values
ranking = sfs.ranking_   # this is the line that raises the AttributeError
names_scores = list(zip(names, ranking))
ns_df = pd.DataFrame(data=names_scores, columns=['Feature_names', 'ranks'])
L = ns_df.sort_values('ranks', ascending=False)
L
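For what it's worth, ranking_ is an attribute of sklearn's RFE, not of SequentialFeatureSelector, which does not rank features at all; it only exposes a boolean mask of the chosen columns via get_support(). A minimal sketch on synthetic data (the make_regression setup is purely illustrative):

import pandas as pd
from sklearn.datasets import make_regression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.tree import DecisionTreeRegressor

# illustrative stand-in for the real dataset
X, y = make_regression(n_samples=200, n_features=10, random_state=0)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(10)])

sfs = SequentialFeatureSelector(DecisionTreeRegressor(), n_features_to_select=5)
sfs.fit(X, y)

# SequentialFeatureSelector has no ranking_; get_support() returns a
# boolean mask over the columns, with no per-feature ranks
selected = X.columns[sfs.get_support()]
print(list(selected))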

A scalable graph method for finding cliques in complete connected components (PySpark)

I'm trying to split the GraphFrame connectedComponents output so that each component is divided into sub-groups that are completely connected, meaning all vertices are connected to each other. The following sketch helps demonstrate what I'm trying to achieve.
I'm using a NetworkX method to achieve it, as follows:
from pyspark.sql.functions import col, concat, lit, when
from pyspark.sql.types import LongType, StringType, StructField, StructType

def create_subgroups(edges, components, key_name='component'):
    # joining the edges to enrich each one with its component id
    sub_components = edges.join(
        components,
        [(edges.dst == components.id) | (edges.src == components.id)]
    ).select('src', 'dst', key_name).drop_duplicates()
    # caching the table using a temp table (save_temp_table is a helper not shown here)
    sub_components = save_temp_table(sub_components, f'inner_sub_{key_name}s', zorder=[key_name])
    schema = StructType([
        StructField("index", LongType(), True),
        StructField("id", StringType(), True),
    ])
    # applying a pandas UDF to enrich each vertex with the new component id
    sub_components = sub_components.groupby(key_name).applyInPandas(
        pd_create_subgroups, schema).where('id != "not_connected"').drop_duplicates()
    # joining the output, multiplying each vertex by the number of sub-groups found
    components = components.join(sub_components, 'id', 'left')
    components = components.withColumn(
        key_name,
        when(col('index').isNull(), col(key_name))
        .otherwise(concat(col(key_name), lit('_'), col('index')))
    ).drop('index')
    return components
import pandas as pd
import networkx as nx
from networkx.algorithms.clique import find_cliques

def pd_create_subgroups(pdf):
    # building the graph
    gnx = nx.from_pandas_edgelist(pdf, 'src', 'dst')
    # removing one-degree nodes
    outdeg = gnx.degree()
    to_remove = [n[0] for n in outdeg if n[1] == 1]
    gnx.remove_nodes_from(to_remove)
    bic = list(find_cliques(gnx))
    if len(bic) <= 2:
        return pd.DataFrame(data={"index": [-1], "id": ["not_connected"]})
    res = {"index": [], "id": []}
    ind = 0
    for i in bic:
        if len(i) < 3:
            continue
        for id in i:
            res['index'] = res['index'] + [ind]
            res['id'] = res['id'] + [id]
        ind += 1
    return pd.DataFrame(res)

# creating sub-components if necessary
subgroups = create_subgroups(edges, components, key_name='component')
My problem is that there's a very large component containing 80% of the vertices, which makes the cluster very slow. I've been trying to use labelPropagation to create smaller groups, but it doesn't do the trick: it splits vertices that should have ended up in the same group.
Here's the cluster usage when it reaches the pandas_udf part:
This issue was resolved by separating the vertices into N groups, pulling all edges for each vertex in a group, and calculating the sub-groups with the find_cliques method.
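A minimal sketch of that resolution, reusing pd_create_subgroups from above; the hash-based bucketing and the choice of N are assumptions on my part:

from pyspark.sql.functions import abs as sql_abs, col, hash as sql_hash
from pyspark.sql.types import LongType, StringType, StructField, StructType

N = 32  # number of vertex groups; tune to the cluster
schema = StructType([StructField("index", LongType(), True),
                     StructField("id", StringType(), True)])

# assign every vertex to one of N buckets
buckets = components.withColumn("bucket", sql_abs(sql_hash("id")) % N)

# pull every edge whose source vertex falls in a bucket
edges_b = edges.join(buckets.select(col("id").alias("src"), "bucket"), "src")

# run the local clique search once per bucket instead of once per huge component
cliques = edges_b.groupby("bucket").applyInPandas(pd_create_subgroups, schema)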

Can I draw a bipartite graph from every dataset?

I am trying to draw a bipartite graph for my data set, which is like below:
source       target        weight
reduce       energy        25
reduce       consumption   25
energy       pennsylvania  4
energy       natural       4
consumption  balancing     4
The code I am using to plot the graph is below:
C_2021 = nx.Graph()
C_2021.add_nodes_from(df_final_2014['source'], bipartite=0)
C_2021.add_nodes_from(df_final_2014['target'], bipartite=1)
edges = df_final_2014[['source', 'target','weight']].apply(tuple, axis=1)
C_2021.add_weighted_edges_from(edges)
But when I check whether it is bipartite with the code below, I get False:
nx.is_bipartite(C_2021)
Could you please advise what the issue is?
The previous issue is resolved, but when I plot the bipartite graph with the steps below, I do not get a proper result. I would appreciate any help:
top_nodes_2021 = set(n for n,d in C_2021.nodes(data=True) if d['bipartite']==0)
top_nodes_2021
the output of the above is:
{'reduce'}
bottom_nodes_2021 = set(C_2021) - top_nodes_2021
bottom_nodes_2021
the output of the above is:
{'balancing', 'consumption', 'energy', 'natural', 'pennsylvania '}
then plot it by:
pos = nx.bipartite_layout(C_2021,top_nodes_2021)
plt.figure(figsize=[8,6])
# Pass that layout to nx.draw
nx.draw(C_2021, pos, node_color='#A0CBE2', edge_color='black', width=0.2,
        edge_cmap=plt.cm.Blues, with_labels=True)
and the result is:
It works for me using your code. nx.is_bipartite(C_2021) returns True. Check the example below:
import sys
if sys.version_info[0] < 3:
    from StringIO import StringIO
else:
    from io import StringIO

import pandas as pd
import networkx as nx

data = StringIO('''source;target;weight
reduce;energy;25
reduce;consumption;25
energy;pennsylvania ;4
energy;natural;4
consumption;balancing;4
''')

df_final_2014 = pd.read_csv(data, sep=";")

C_2021 = nx.Graph()
C_2021.add_nodes_from(df_final_2014['source'], bipartite=0)
C_2021.add_nodes_from(df_final_2014['target'], bipartite=1)
edges = df_final_2014[['source', 'target', 'weight']].apply(tuple, axis=1)
C_2021.add_weighted_edges_from(edges)

nx.is_bipartite(C_2021)
Finally, to draw the graph, get the bipartite sets from the graph itself. The bipartite attributes you passed during creation (bipartite=0 and bipartite=1) are unreliable here, because nodes like energy and consumption appear in both source and target, so their attribute gets overwritten.
Use the following commands:
from networkx.algorithms import bipartite
import matplotlib.pyplot as plt

top_nodes_2021, bottom_nodes_2021 = bipartite.sets(C_2021)
pos = nx.bipartite_layout(C_2021, top_nodes_2021)
plt.figure(figsize=[8, 6])
# Pass that layout to nx.draw
nx.draw(C_2021, pos, node_color='#A0CBE2', edge_color='black', width=0.2,
        edge_cmap=plt.cm.Blues, with_labels=True)
With the following result:
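One caveat worth noting: bipartite.sets raises AmbiguousSolution when the graph is disconnected, because the two sides are then not uniquely determined. A small sketch of a per-component two-coloring that avoids this (the toy graph is illustrative):

import networkx as nx
from networkx.algorithms import bipartite

G = nx.Graph([("a", "b"), ("c", "d")])  # two disconnected edges
color = bipartite.color(G)              # 0/1 coloring, computed per component
top = {n for n, c in color.items() if c == 0}
bottom = set(G) - top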

Why is scipy.griddata much slower than MATLAB's griddata?

I found some existing topics about this but somehow I could not find an answer...
Here is a Python example taken from https://gist.github.com/fjarri/b6f1faefa95995d119b8 (already used in Why is scipy.interpolate.griddata so slow?), giving:
Python:
import time
import numpy as np
from scipy.interpolate import griddata

def func(x, y):
    return x*(1-x)*np.cos(4*np.pi*x) * np.sin(4*np.pi*y**2)**2

grid_x, grid_y = np.mgrid[0:1:200j, 0:1:200j]
points = np.random.rand(410500, 2)
values = func(points[:, 0], points[:, 1])

t1 = time.time()
grid_z1 = griddata(points, values, (grid_x, grid_y), method='linear')
print(time.time() - t1)
The print always gives about 6.4 seconds.
MATLAB:
[grid_x, grid_y] = meshgrid(1:200, 1:200);
points = rand(410500, 2);
x=points(:,1);
y=points(:,2);
values = x.*(1-x).*cos(4*pi*x).*sin(4*pi*y.^2).^2;
tic;vq = griddata(x,y,values,grid_x,grid_y,'linear');toc;
The print always gives about 2.4 seconds.
Does anyone know why there is such a big difference between the two, and whether there is a way to accelerate scipy.griddata? I deal with many large arrays of scattered points, and griddata is responsible for most of my computation time, but I need to use Python for this and it is very slow.
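One commonly suggested mitigation, sketched below under the assumption that the same scattered points are reused across several calls: scipy's griddata rebuilds the Delaunay triangulation on every call, so building it once and reusing it through LinearNDInterpolator amortizes the expensive step.

from scipy.spatial import Delaunay
from scipy.interpolate import LinearNDInterpolator

# reuses `points`, `values`, `grid_x`, `grid_y` from the example above
tri = Delaunay(points)                      # the expensive step, done once
interp = LinearNDInterpolator(tri, values)  # cheap to rebuild per value set
grid_z1 = interp(grid_x, grid_y)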

Traffic Sign detection and Recognition using Neural networks

I wanted to detect and recognize traffic signs from a video feed. I used the TensorFlow ML framework for recognizing the signs and a Haar classifier for detecting them.
Here is the code:
import cv2
import numpy as np
import tensorflow as tf
import os, time
import threading

# constants
IMAGE_SIZE = 200.0
MATCH_THRESHOLD = 3

def SignRecognizer():
    # to neglect all tensorflow compilation warnings
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    # path to the blob
    image_path = '/root/Desktop/blob.jpg'
    # read the image data
    image_data = tf.gfile.FastGFile(image_path, 'rb').read()
    # load label file, strip off carriage return \n
    label_lines = [line.rstrip() for line in
                   tf.gfile.GFile("/root/Desktop/another_model/retrained_labels.txt")]
    # unpersist graph from file
    with tf.gfile.FastGFile("/root/Desktop/another_model/retrained_graph.pb", 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        _ = tf.import_graph_def(graph_def, name='')
    with tf.Session() as sess:
        # feed the image_data as input to the graph and get the first prediction
        softmax_tensor = sess.graph.get_tensor_by_name("final_result:0")
        predictions = sess.run(softmax_tensor,
                               {'DecodeJpeg/contents:0': image_data})
        # sort to show labels of first prediction in order of confidence
        top_k = predictions[0].argsort()[-len(predictions[0]):][::-1]
        for node_id in top_k:
            human_string = label_lines[node_id]
            print("%s" % (human_string))
            break

roundabout_cascade = cv2.CascadeClassifier("/root/Desktop/tsp/haarcascade_roundabout.xml")
videocapture = cv2.VideoCapture(0)
scale_factor = 1.3

while 1:
    ret, pic = videocapture.read()
    # do roundabout detection on the street image
    gray = cv2.cvtColor(pic, cv2.COLOR_BGR2GRAY)  # capture frames are BGR
    signs = roundabout_cascade.detectMultiScale(pic, scaleFactor=1.4, minNeighbors=6)
    # initialize ORB and BFMatcher
    orb = cv2.ORB_create()
    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    # find the keypoints and descriptors for the roadsign image
    roadsign = cv2.imread("/root/Desktop/tsp/roundabout.jpg", 0)
    kp_r, des_r = orb.detectAndCompute(roadsign, None)
    for (x, y, w, h) in signs:
        # cv2.rectangle(pic, (x, y), (x+w, y+h), (255, 0, 0), 2)
        # obtain the object from the street image
        obj = gray[y:y+h, x:x+w]
        color_image = pic[y:y+h, x:x+w]
        cv2.imwrite("/root/Desktop/blob.jpg", color_image)
        cv2.imshow('blob', color_image)
        # start a new thread and run SignRecognizer on it
        t = threading.Thread(name="SignRecognizer", target=SignRecognizer)
        # set the thread as a daemon to prevent blocking of the main program
        t.setDaemon(True)
        t.start()
        ratio = IMAGE_SIZE / obj.shape[1]
        obj = cv2.resize(obj, (int(IMAGE_SIZE), int(obj.shape[0]*ratio)))
        # find the keypoints and descriptors for the object
        kp_o, des_o = orb.detectAndCompute(obj, None)
        if len(kp_o) == 0 or des_o is None:
            continue
        # match descriptors
        matches = bf.match(des_r, des_o)
        # draw the object on the street image, if threshold met
        if len(matches) >= MATCH_THRESHOLD:
            cv2.rectangle(pic, (x, y), (x+w, y+h), (255, 0, 0), 2)
            font = cv2.FONT_HERSHEY_SIMPLEX
            cv2.putText(pic, 'Roundabout sign', (x, y), font, 1,
                        (255, 255, 255), 1, cv2.LINE_AA)
    cv2.imshow('roundabout_signs', pic)
    k = cv2.waitKey(30) & 0xFF
    if k == 2:
        break

cv2.waitKey(0)
cv2.destroyAllWindows()
The SignRecognizer function reads the blob image file and recognizes the sign using the model I created with the TensorFlow ML framework.
I used VideoCapture(0) to start the webcam and simulate a live video feed.
I also used OpenCV's ORB (Oriented FAST and Rotated BRIEF) to remove false positives.
I used the threading module to run SignRecognizer on another thread and set it as a daemon so that the main program wasn't blocked during recognition.
Everything works great, but there seems to be a little lag in spite of using the threading module. Is there any way to make it lag-free?
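Two likely contributors to the lag in the loop above: ORB, the matcher, and the template image are rebuilt on every frame, and each detection spawns a fresh thread that reloads the TensorFlow graph from disk. A hedged sketch of the usual pattern, one long-lived worker fed through a bounded queue; recognize() is a hypothetical stand-in for a model loaded once at startup:

import queue
import threading

blob_queue = queue.Queue(maxsize=1)  # hold at most one pending crop

def recognize(crop):
    # hypothetical placeholder: run a graph/session that was loaded once
    # at startup on `crop`, instead of reloading it per detection
    pass

def recognizer_worker():
    while True:
        crop = blob_queue.get()
        recognize(crop)
        blob_queue.task_done()

worker = threading.Thread(target=recognizer_worker, daemon=True)
worker.start()

# inside the camera loop: hand the crop off without waiting, and simply
# drop it if the worker is still busy with the previous one
def submit(color_image):
    try:
        blob_queue.put_nowait(color_image)
    except queue.Full:
        pass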