'SequentialFeatureSelector' object has no attribute 'ranking_' - feature-selection

Hi everyone,
I would like to sort the best features that I'm getting from SequentialFeatureSelector based on their ranks.
But I get this error:
'SequentialFeatureSelector' object has no attribute 'ranking_'
Can you help me solve it and sort my features by their importance?
`import pandas as pd
df=pd.read_csv('')
#Dividing X and Y
Y=df.iloc[:,-1:]
X = df[df.columns.drop(Y)]
#Selecting the K best features
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor()
sfs = SequentialFeatureSelector(model, n_features_to_select=5)
fit = sfs.fit(X, Y)
#Selecting them and showing them in a dataframe
names = X.columns.values
ranking = sfs.ranking_
names_scores = list(zip(names, ranking))
ns_df = pd.DataFrame(data = names_scores, columns=['Feature_names', 'ranks'])
L=ns_df.sort_values('ranks', ascending=False)
L`
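For context, one possible direction (a sketch building on the variables above, not a verified fix for this exact dataset): scikit-learn's SequentialFeatureSelector has no ranking_ attribute; it only reports which columns were kept, via get_support() / support_. If an explicit rank per feature is needed, RFE does expose ranking_. Something along these lines:

# Option 1: list the columns SequentialFeatureSelector kept (no ranks are available)
selected = pd.DataFrame({
    'Feature_names': X.columns.values,
    'selected': sfs.get_support()      # boolean mask, True for the 5 chosen features
})
print(selected[selected['selected']])

# Option 2 (an alternative, only if switching selectors is acceptable):
# RFE exposes ranking_, where 1 means "selected"
from sklearn.feature_selection import RFE
rfe = RFE(model, n_features_to_select=5).fit(X, Y.values.ravel())
ranks = pd.DataFrame({'Feature_names': X.columns.values, 'ranks': rfe.ranking_})
print(ranks.sort_values('ranks'))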

Related

Nearest building with OpenStreetMap

I have a CSV of relevant points with latitude and longitude and I'm trying to get the nearest building data for each point and add a column to the CSV (or pandas DataFrame) in Python. I've tried using Pyrosm and various libraries but can't seem to prune the data to get the nearest building and then add the data. Thanks.
This is what I have:
from pyrosm import OSM
from pyrosm import get_data
import geopandas as gpd
from sklearn.neighbors import BallTree
import numpy as np
import osmnx as ox
# get rid of weird error
import shapely
import warnings
from shapely.errors import ShapelyDeprecationWarning
import csv

def get_gig_data(csv_fname):
    with open(csv_fname, "r", encoding="latin-1") as gig_records:
        for gig_record in csv.reader(gig_records):
            yield gig_record

def main():
    warnings.filterwarnings("ignore", category=ShapelyDeprecationWarning)
    chicago_osm = OSM(get_data("chicago"))

    # get a Point of Interest GeoDataFrame
    points_of_interest = chicago_osm.get_pois()  # can use a custom filter if we want to filter the types, but I think no filter might be the best

    # get buildings, nodes and edges
    nodes, edges = chicago_osm.get_network(nodes=True, network_type="walking")
    buildings = chicago_osm.get_buildings()
    b_cnt = len(buildings)
    G = chicago_osm.to_graph(nodes, edges)
    # nodes = get_igraph_nodes(G)

    buildings['geometry'] = buildings.centroid

    # poi_list = np.asarray([point.coords for point in points_of_interest['geometry']])  # if point.geom_type == point])
    # print(poi_list.shape)
    # tree = BallTree(np.asarray([point.coords for point in points_of_interest['geometry'] if point.geom_type == point]), metric="manhattan")  # Note: the scipy implementation of manhattan/cityblock distance might be faster according to the internet bc it uses a C function

    # Read in the gig work data - I think the best way to do this will probably be with the CSV reader, because it goes line by line and saves a ton of memory
    '''for i in points_of_interest:
        print('Type: ', type(i), ' ', i)'''

    gig_fp = "data_sample.csv"
    # gig_data = gpd.read_file(gig_fp)
    iter_gig = iter(get_gig_data(gig_fp))
    next(iter_gig)

    ids = dict()
    for building in buildings.iterrows():
        # print(type(building[1][32]), ' ', building[1][32])
        # tup = tuple(float(x) for x in [trip[17][8:-1].split()])
        ids[building[1][32]] = building

    # make the tree that determines closest POI
    # if we use the CSV reader this for loop will be done already
    for trip in iter_gig:
        # Using a generator so this should be efficient memory wise.
        tup = tuple([float(x) for x in trip[17][8:-1].split()])
        print(type(tup), ' ', tup)
        src_ids, euclidean_distance = ox.distance.nearest_nodes(G, tup)
        # find nearest node
        # THEN ADD THE PICKUP AND DROPOFF IDS TO THIS TUPLE AND ADD TO A NEW NP ARRAY

if __name__ == '__main__':
    main()
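One possible direction (a minimal sketch, not a tested solution for this data): build a BallTree over the building centroids and query it once per CSV row, then write the match back as new columns. The lat/lon column names and the buildings' id column are assumptions here.

import numpy as np
import pandas as pd
from pyrosm import OSM, get_data
from sklearn.neighbors import BallTree

osm = OSM(get_data("chicago"))
buildings = osm.get_buildings()
centroids = buildings.geometry.centroid          # one representative point per building

# BallTree with the haversine metric expects (lat, lon) in radians
building_coords = np.deg2rad(np.column_stack([centroids.y, centroids.x]))
tree = BallTree(building_coords, metric="haversine")

points = pd.read_csv("data_sample.csv")          # assumed columns: "lat" and "lon"
query = np.deg2rad(points[["lat", "lon"]].to_numpy())
dist, idx = tree.query(query, k=1)               # nearest building per point

points["nearest_building_id"] = buildings.iloc[idx.ravel()]["id"].to_numpy()
points["building_distance_m"] = dist.ravel() * 6_371_000   # radians -> metres
points.to_csv("data_sample_with_buildings.csv", index=False)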

A scalable graph method for finding cliques in complete connected components in PySpark

I'm trying to split the GraphFrame connectedComponents output so that each component gets a sub-group for each completely connected set of vertices (a clique), meaning all vertices are connected to each other. The following sketch helps demonstrate what I'm trying to achieve.
I'm using a NetworkX method to achieve it, as follows:
import pandas as pd
import networkx as nx
from networkx.algorithms.clique import find_cliques
from pyspark.sql.functions import col, concat, lit, when
from pyspark.sql.types import LongType, StringType, StructField, StructType

def create_subgroups(edges, components, key_name='component'):
    # joining the edges to enrich with the component id
    sub_components = edges.join(components, [(edges.dst == components.id) | (edges.src == components.id)]).select('src', 'dst', key_name).drop_duplicates()
    # caching the table using a temp table (save_temp_table is the author's own helper)
    sub_components = save_temp_table(sub_components, f'inner_sub_{key_name}s', zorder=[key_name])
    schema = StructType([
        StructField("index", LongType(), True),
        StructField("id", StringType(), True),
    ])
    # applying the pandas udf to enrich each vertex with the new component id
    sub_components = sub_components.groupby(key_name).applyInPandas(pd_create_subgroups, schema).where('id != "not_connected"').drop_duplicates()
    # joining the output and multiplying each vertex by the number of sub-groups found
    components = components.join(sub_components, 'id', 'left')
    components = components.withColumn(key_name, when(col('index').isNull(), col(key_name)).otherwise(concat(col(key_name), lit('_'), concat('index')))).drop('index')
    return components

def pd_create_subgroups(pdf):
    # building the graph
    gnx = nx.from_pandas_edgelist(pdf, 'src', 'dst')
    # removing one-degree nodes
    outdeg = gnx.degree()
    to_remove = [n[0] for n in outdeg if n[1] == 1]
    gnx.remove_nodes_from(to_remove)
    bic = list(find_cliques(gnx))
    if len(bic) <= 2:
        return pd.DataFrame(data={"index": [-1], "id": ["not_connected"]})
    res = {
        "index": [],
        "id": []
    }
    ind = 0
    for i in bic:
        if len(i) < 3:
            continue
        for id in i:
            res['index'] = res['index'] + [ind]
            res['id'] = res['id'] + [id]
        ind += 1
    return pd.DataFrame(res)

# creating sub-components if necessary
subgroups = create_subgroups(edges, components, key_name='component')
My problem is that there's a very large component containing 80% of the vertices, which makes the cluster very slow. I've been trying to use labelPropagation to create smaller groups, but it didn't do the trick: it split the graph in a way that isn't suitable, separating vertices that should have ended up in the same groups.
Here's the cluster usage when it reaches the pandas_udf part.
This issue was resolved by separating the vertices into N groups, pulling all edges for each vertex in the group, and calculating the sub-groups using the find_cliques method.
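A rough sketch of that resolution, under assumed column names (src, dst, id) and an assumed bucket count N: hash each vertex into a bucket, pull the edges touching each bucket, and run the existing pd_create_subgroups UDF per bucket instead of per component.

from pyspark.sql import functions as F
from pyspark.sql.types import LongType, StringType, StructField, StructType

N = 32  # number of vertex buckets; an assumption, tune to the cluster size

schema = StructType([StructField("index", LongType(), True),
                     StructField("id", StringType(), True)])

# assign every vertex to one of N buckets
bucketed = components.withColumn("bucket", F.abs(F.hash("id")) % N)

# pull all edges that touch a vertex in each bucket
edges_b = (edges
           .join(bucketed.select("id", "bucket"), edges.src == F.col("id"))
           .drop("id"))

# run the clique search per bucket instead of on one huge component
cliques = edges_b.groupby("bucket").applyInPandas(pd_create_subgroups, schema)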

Why is scipy.griddata much slower than MATLAB's griddata?

I found some existing topics about this but somehow I could not find an answer...
Here is a Python example taken from https://gist.github.com/fjarri/b6f1faefa95995d119b8 (already used in "Why is scipy.interpolate.griddata so slow?"), giving:
Python:
import time
import numpy as np
from scipy.interpolate import griddata

def func(x, y):
    return x*(1-x)*np.cos(4*np.pi*x) * np.sin(4*np.pi*y**2)**2

grid_x, grid_y = np.mgrid[0:1:200j, 0:1:200j]
points = np.random.rand(410500, 2)
values = func(points[:, 0], points[:, 1])

t1 = time.time()
grid_z1 = griddata(points, values, (grid_x, grid_y), method='linear')
print(time.time() - t1)
The print always gives about 6.4 seconds.
MATLAB:
[grid_x, grid_y] = meshgrid(1:200, 1:200);
points = rand(410500, 2);
x=points(:,1);
y=points(:,2);
values = x.*(1-x).*cos(4*pi*x).*sin(4*pi*y.^2).^2;
tic;vq = griddata(x,y,values,grid_x,grid_y,'linear');toc;
The print always gives about 2.4 seconds.
Does anyone know why there is such a big difference between the two, and whether there is a way to accelerate scipy.griddata? I deal with many large arrays of scattered points, and griddata is responsible for most of my computation time, but I need to use Python for this and it is very slow.
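One commonly suggested speed-up, sketched below under the assumption that the same scattered points are interpolated more than once: griddata rebuilds the Delaunay triangulation on every call, so building it once and reusing it through LinearNDInterpolator amortises most of the cost across calls.

import numpy as np
from scipy.spatial import Delaunay
from scipy.interpolate import LinearNDInterpolator

def func(x, y):
    return x * (1 - x) * np.cos(4 * np.pi * x) * np.sin(4 * np.pi * y**2) ** 2

grid_x, grid_y = np.mgrid[0:1:200j, 0:1:200j]
points = np.random.rand(410500, 2)

tri = Delaunay(points)                # pay the triangulation cost once
interp = LinearNDInterpolator(tri, func(points[:, 0], points[:, 1]))
grid_z1 = interp(grid_x, grid_y)      # later value sets can reuse the same `tri`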

Getting model parameters from regression in PySpark in an efficient way for large data

I have created a function for applying OLS regression and just getting the model parameters. I used groupby and applyInPandas, but it's taking too much time. Is there a more efficient way to work around this?
Note: I didn't really need a groupby here, but since I cannot use applyInPandas without one, I created a dummy feature 'group' holding the same value 1 for every row.
Code
import pandas as pd
import statsmodels.api as sm
from pyspark.sql.functions import lit
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

pdf = pd.DataFrame({
    'x': [3, 6, 2, 0, 1, 5, 2, 3, 4, 5],
    'y': [0, 1, 2, 0, 1, 5, 2, 3, 4, 5],
    'z': [2, 1, 0, 0, 0.5, 2.5, 3, 4, 5, 6]})
df = sqlContext.createDataFrame(pdf)

result_schema = StructType([
    StructField('index', StringType()),
    StructField('coef', DoubleType())
])

def ols(pdf):
    y_column = ['z']
    x_column = ['x', 'y']
    y = pdf[y_column]
    X = pdf[x_column]
    model = sm.OLS(y, X).fit()
    param_table = pd.DataFrame(model.params, columns=['coef']).reset_index()
    return param_table

# adding a new column to apply groupby
df = df.withColumn('group', lit(1))

# applying the function
data = df.groupby('group').applyInPandas(ols, schema=result_schema)
Final output sample
index coef
x 0.183246073
y 0.770680628
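A sketch of one more distributed alternative (not the asker's code): fitting the same no-intercept OLS with Spark ML, which spreads the work across the cluster instead of funnelling the whole dataset into a single pandas group. Column names x, y, z come from the example above; fitIntercept=False matches sm.OLS without an added constant.

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

assembler = VectorAssembler(inputCols=["x", "y"], outputCol="features")
train = assembler.transform(df).select("features", "z")

lr = LinearRegression(featuresCol="features", labelCol="z",
                      fitIntercept=False, regParam=0.0)  # plain least squares
model = lr.fit(train)

print(dict(zip(["x", "y"], model.coefficients)))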

OSMnx: Creating Custom Queries with Alternative Infrastructures

I'm new to OSMnx and Overpass queries in general. I am trying to understand the correct way to write custom queries when working with non-street infrastructure types.
Specifically, I am trying to understand why this query works:
import osmnx as ox
my_custom_filter = '["railway"~"disused"]'
G = ox.graph_from_point((51.5073509, -0.1277583),
                        distance = 10000,
                        distance_type = 'bbox',
                        infrastructure = 'way["railway]',
                        network_type = 'none',
                        custom_filter = my_custom_filter
                        )
But this one yields a bad request error:
import osmnx as ox
my_custom_filter = '["railway"~"disused"]'
G = ox.graph_from_point((51.5073509, -0.1277583),
                        distance = 10000,
                        distance_type = 'bbox',
                        infrastructure = 'way["railway~"rail"]',
                        network_type = 'none',
                        custom_filter = my_custom_filter
                        )
Notice that the difference is simply that I am specifying rail as the type of railway in the latter query.
See the OSM Railway Guide here.
If anyone can point me to any resources which would help me further understand how to construct custom filters - particularly custom filters with more than one filter - that would be excellent too. For example, what would be the correct syntax for adding an additional custom filter?
You were just missing a " in your argument. This works:
import osmnx as ox
ox.config(log_console=True, use_cache=True)
point = (51.5073509,-0.1277583)
dist = 10000
dt = 'bbox'
cf = '["railway"~"disused"]'
G = ox.graph_from_point(point, dist=dist, dist_type=dt, custom_filter=cf)
But it produces an EmptyOverpassResponse error, as there is nothing that matches your query in that search area. You will, however, get a graph if you change it to this, for example:
cf = '["railway"!~"disused"]'
G = ox.graph_from_point(point, dist=dist, dist_type=dt, custom_filter=cf)
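On the follow-up question about combining more than one filter: to the best of my knowledge, Overpass tag filters can be chained or combined with regex alternation inside a single custom_filter string, since the string is passed straight into the Overpass query. A sketch (adjust the tags to your use case):

import osmnx as ox

point = (51.5073509, -0.1277583)

# match ways whose railway tag is either "rail" or "disused"
cf_union = '["railway"~"rail|disused"]'
G1 = ox.graph_from_point(point, dist=10000, dist_type='bbox', custom_filter=cf_union)

# chain two tag filters: railway ways that are not tunnels
cf_chain = '["railway"]["tunnel"!~"yes"]'
G2 = ox.graph_from_point(point, dist=10000, dist_type='bbox', custom_filter=cf_chain)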