Python: k-means clustering

Python: k-means clustering - python

I’m trying to use k-means clustering for the data of longitudes and latitudes of a .csv file but instead of plotting a graph I only want to get and print the centroids so that i can search them on google maps. Does anyone know how to code that?
import pandas as pd
import numpy as np
import csv
with open('fileName.csv', 'r') as infile:
csv_reader = csv.reader (infile, delimiter=',')
x = []
y = []
for row in csv_reader:
if row[3] != 'LONGITUDE':
x.append(float(row[3]))
y.append(float(row[4]))
df = pd.DataFrame({
'x': x,
'y': y
})
#implement x and y in k-means and print the centroids

I would suggest looking at the Shapely library.
from shapely.geometry import MultiPoint
import pandas as pd
import numpy as np
import csv
# Easy way to read your csv file in
df = pd.read_csv('fileName.csv').rename(columns={'LONGITUDE': 'x', 'LATITUDE': 'y'})
# Assumes you have a column 'cluster_id' that references the cluster id for each coordinate
cluster_ids = df['cluster_id'].unique()
kmeans_clusters = []
for cluster_id in cluster_ids:
# Filtered df for each cluster id
cluster_df = df.loc[df['cluster_id'] == cluster_id]
x_values = cluster_df['x'].tolist()
y_values = cluster_df['y'].tolist()
xy_pairs = [point for point in zip(x_values, y_values)]
kmeans_clusters.append(xy_pairs)
# Where kmeans_clusters is a list of your clusters, each containing a list of xy pairs
centroids = []
for cluster in kmeans_clusters:
if len(cluster) > 1:
# Create a convex hull, find the centroid
convex_hull = MultiPoint(cluster).convex_hull
centroid = convex_hull.centroid
# Unpack to tuple object
centroids.append(list(centroid.coords)[0])
else:
# Single point cluster, it is the centroid
centroids.append(cluster[0])
print(centroids)

Related

Clustering near Lines using coordinates in Python

I have a list with x- and y-coordinates of start and Endpoints of some lines.Lines as csv
331,178,486,232
185,215,386,308
172,343,334,419
406,128,570,165
306,106,569,166
159,210,379,299
236,143,526,248
303,83,516,178
409,62,572,106
26,287,372,427
31,288,271,381
193,228,432,330
120,196,432,329
136,200,374,297
111,189,336,289
284,186,560,249
333,202,577,254
229,194,522,219
349,111,553,165
121,322,342,416
78,303,285,391
103,315,340,415
The lines look like this on my example image. Lines plotted
I want to group lines which are close to each other into clusters and create one line for each cluster. For this example i would like to have 5 clusters. After that i want to calculate the distance from each clusterline to the next.
import csv, math
file = open("lines.csv")
csvreader = csv.reader(file)
lines = []
for data in csvreader:
lines.append({'x1':int(data[0]), 'y1':int(data[1]), 'x2':int(data[2]), 'y2':int(data[3])})
def point_delta(p1, p2):
return abs(p1 - p2)
for line in lines[:2]:
for line_rev in lines:
#x_start_delta = abs(line['x1'] - line_rev['x1'])
x_start_delta = point_delta(line['x1'], line_rev['x1'])
y_start_delta = abs(line['y1'] - line_rev['y1'])
start_distance = math.sqrt(x_start_delta**2 + y_start_delta**2)
x_end_delta = abs(line['x2'] - line_rev['x2'])
y_end_delta = abs(line['y2'] - line_rev['y2'])
end_distance = math.sqrt(x_end_delta**2 + y_end_delta**2)
avg_distance = (start_distance + end_distance)/2
cluster = 0
if avg_distance < 100:
print(f"distance: {avg_distance}")
print("############## next line ##############")
I have written some code to calculate the distance between each line but cant find a way to save the lines which are near to each other in different lists.
Does somebody know how to do this or is there another way to create clusters? Im also thinking about using the middlepoint instead of the start-/endpoint

You could throw a clustering on it, but it has trouble with the lonely line at the end
data = [[331,178,486,232],
[185,215,386,308],
[172,343,334,419],
[406,128,570,165],
[306,106,569,166],
[159,210,379,299],
[236,143,526,248],
[303,83,516,178],
[409,62,572,106],
[26,287,372,427],
[31,288,271,381],
[193,228,432,330],
[120,196,432,329],
[136,200,374,297],
[111,189,336,289],
[284,186,560,249],
[333,202,577,254],
[229,194,522,219],
[349,111,553,165],
[121,322,342,416],
[78,303,285,391],
[103,315,340,415]]
import pandas as pd
import sklearn
from sklearn.cluster import MiniBatchKMeans
import numpy as np
lines = pd.DataFrame(data)
CLUSTERS = 5
X = lines.values
kmeans = MiniBatchKMeans(n_clusters=CLUSTERS,max_no_improvement=100).fit(X)
import numpy as np
import pylab as pl
from matplotlib import collections as mc
lines_segments = [ [ (l[0],l[1]),([l[2],l[3]]) ] for l in lines.values]
center_segments = [ [ (l[0],l[1]),([l[2],l[3]]) ] for l in kmeans.cluster_centers_]
line_collection = mc.LineCollection(lines_segments, linewidths=2)
centers = mc.LineCollection(center_segments, colors='red', linewidths=4, alpha=1)
fig, ax = pl.subplots()
ax.add_collection(line_collection)
ax.add_collection(centers)
ax.autoscale()
ax.margins(0.1)
You can see the centers with
kmeans.cluster_centers_

Reverse Array in a dataframe

Hi I am trying to extract data from a netCDF file, but the data is upside down. How can I reverse the database:
The data I want to extract is the height data from the (netcdf) at the points I have in the CSV file. my Data:
import numpy as np
from netCDF4 import Dataset
import matplotlib.pyplot as plt
import pandas as pd
from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Path, PathPatch
csv_data = np.loadtxt('CSV with target coordinates',skiprows=1,delimiter=',')
num_el = csv_data[:,0]
lat = csv_data[:,1]
lon = csv_data[:,2]
value = csv_data[:,3]
data = Dataset("elevation Data",'r')
lon_range = data.variables['x_range'][:]
lat_range = data.variables['y_range'][:]
topo_range = data.variables['z_range'][:]
spacing = data.variables['spacing'][:]
dimension = data.variables['dimension'][:]
z = data.variables['z'][:]
lon_num = dimension[0]
lat_num = dimension[1]
etopo_lon = np.linspace(lon_range[0],lon_range[1],dimension[0])
etopo_lat = np.linspace(lat_range[0],lat_range[1],dimension[1])
topo = np.reshape(z, (lat_num, lon_num))
height = np.empty_like(num_el)
desired_lat_idx = np.empty_like(num_el)
desired_lon_idx = np.empty_like(num_el)
for i in range(len(num_el)):
tmp_lat = np.abs(etopo_lat - lat[i]).argmin()
tmp_lon = np.abs(etopo_lon - lon[i]).argmin()
desired_lat_idx[i] = tmp_lat
desired_lon_idx[i] = tmp_lon
height[i] = topo[tmp_lat,tmp_lon]
height[height<-10]=0
print(len(desired_lat_idx))
print(len(desired_lon_idx))
print(len(height))
dfl= pd.DataFrame({
'Latitude' : lat.reshape(-1),
'Longitude': lon.reshape(-1),
'Altitude': height.reshape(-1)
});
print(dfl)
# but the Lat should not be changed here (the dfl must be correct)
df =dfl
lat=np.array(df['Latitude'])
lon=np.array(df['Longitude'])
val=np.array(df['Altitude'])
m = basemap.Basemap(projection='robin', lon_0=0, lat_0=0, resolution='l',area_thresh=1000)
m.drawcoastlines(color = 'black')
x,y = m(lon,lat)
colormesh= m.contourf(x,y,val,100, tri=True, cmap = 'terrain')
plt.colorbar(location='bottom',pad=0.04,fraction=0.06)
plt.show()
I have already tried:
lat = csv_data[:,1]
lat= lat*(-1)
But this didn´t work

It's a plotting artifact().
Just do:
colormesh= m.contourf(x,y[::-1],val,100, tri=True, cmap = 'terrain')
y[::-1] will reverse the order of the y latitude elements (as opposed to the land-mass outlines; and while keeping the x longitude coordinates the same) and hence flip them.
I've often had this problem with plotting numpy image data in the past.
Your raw CSV data are unlikely to be flipped themselves (why would they be?). You should try sanity-checking them [I am not a domain expert I'm afraid]! Overlaying an actual coordinate grid can help with this.
Another way to do it is given here: Reverse Y-Axis in PyPlot
You could also therefore just do
ax = plt.gca()
ax.invert_yaxis()

DBSCAN Clustering loop

Using DB scan I am iterating through a csv with x, y, z, and id data. I would like to generate a new csv for a every combination of eps and minimum samples that occur within a set range.
The output csv should also include the number of points in the cluster, eps value, and minimum samples used as columns along with the original x, y, z, and id data. The code below will do this, but will only create a csv for the last cluster calculated.
%matplotlib notebook
from interp import *
import pandas as pd
import pandas as pdfrom sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score
import itertools
points_file = "examples/example-1/fcr.csv"
eps_values = np.arange(0.1,0.5,0.1)
min_samples = np.arange(5,20)
dbscan_params = list(itertools.product(eps_values, min_samples))
cluster_n = []
epsvalues = []
min_samp = []
for p in dbscan_params:
dbscan_cluster = DBSCAN(eps=p[0],
min_samples=p[1]).fit(points)
epsvalues.append(p[0])
min_samp.append(p[1]), cluster_n.append(
len(np.unique(dbscan_cluster.labels_)))
df = pd.read_csv(points_file, header=None, names=["x", "y", "z", "id"])
df["cluster"] = clustering.labels_
df["eps"] = eps
df["min_sample"] = min_sam
csv_name = f'fcr_{eps}e{min_sam}m.csv'
df.to_csv(csv_name, index=False)

Convert DataFrame to a multi polygon DataFrame, multiple data point - python

I have a DataFrame as below, I want to convert data to a multi polygon DataFrame, because I want to plot each multi polygon on a map.
I know how to convert if I have two data point, but with 6 data point, I don't know how to convert it. can anyone help me please.
geometry = [Point(xy) for xy in zip(neightrip_counts_.lan0, neightrip_counts_.long0)]
geometry
#neightrip_counts_.lan1, neightrip_counts_.long1,neightrip_counts_.lan2, neightrip_counts_.long2
lan0 long0 lan1 long1 lan2 long2
0 59.915667 10.777567 59.916738 10.779916 59.914943 10.773977
1 59.929853 10.711515 59.929435 10.713682 59.927596 10.710033
2 59.939230 10.759170 59.937205 10.760581 59.943750 10.760306
3 59.912520 10.762240 59.911594 10.761774 59.912347 10.763815
4 59.929634 10.732839 59.927140 10.730981 59.931081 10.736003

Let me rename the dataframe neightrip_counts_ as df for brevity. Here is the relevant code that will create a polygon for each row of dataframe.
df['geometry'] = [Polygon([(z[0],z[1]), (z[2],z[3]), (z[4],z[5])]) for z in zip(df.long0, df.lan0, df.long1, df.lan1, df.long2, df.lan2)]
gpdf = df.set_geometry("geometry", drop=True)
gpdf.plot()
By the way, you must be careful about the sequence of (long, lat).
start_coords = [ gdf.centroid[0].x, gdf.centroid[0].y] # is wrong
Use this in stead.
start_coords = [ gdf.centroid[0].y, gdf.centroid[0].x]
Edit
For the benefits of the readers, here is the complete runnable code:
import pandas as pd
import geopandas as gpd
from io import StringIO
from shapely.geometry import Polygon, Point, LineString
import numpy as np
import folium
data1 = """index lan0 long0 lan1 long1 lan2 long2
0 59.915667 10.777567 59.916738 10.779916 59.914943 10.773977
1 59.929853 10.711515 59.929435 10.713682 59.927596 10.710033
2 59.939230 10.759170 59.937205 10.760581 59.943750 10.760306
3 59.912520 10.762240 59.911594 10.761774 59.912347 10.763815
4 59.929634 10.732839 59.927140 10.730981 59.931081 10.736003"""
# read/parse data into dataframe
df0 = pd.read_csv(StringIO(data1), sep='\s+', index_col='index')
# create `geometry` column
df0['geometry'] = [Polygon([(xy[0],xy[1]), (xy[2],xy[3]), (xy[4],xy[5])]) \
for xy in zip(df0.long0, df0.lan0, df0.long1, df0.lan1, df0.long2, df0.lan2)]
# set geometry
gpdf = df0.set_geometry("geometry", drop=True)
# do check plot. (uncomment next line)
#gpdf.plot()
# make geojson
center_pt = gpdf.centroid[0].y, gpdf.centroid[0].x
gdf_json = gpdf.to_json()
# plot the geojson on the folium webmap
webmap = folium.Map(location = center_pt, zoom_start = 13, min_zoom = 3)
folium.GeoJson(gdf_json, name='data_layer_1').add_to(webmap)
# this opens the webmap
webmap
Output screen capture (of interactive webmap):

Try this, assuming the 'lan' is latitude.
import geopandas as gpd
from shapely.geometry import Polygon
import numpy as np
import pandas as pd
import folium
# ....
def addpolygeom(row):
row_array = np.array(row)
# split dataframe row to a list of tuples (lat, lon)
coords = [tuple(i)[::-1] for i in np.split(row_array, range(2, row_array.shape[0], 2))]
polygon = Polygon(coords)
return polygon
# Convert points to shapely geometry
neightrip_counts_['geometry'] = neightrip_counts_.apply(lambda x: addpolygeom(x), axis=1)
# Create a GeoDataFrame
gdf = gpd.GeoDataFrame(neightrip_counts_, geometry='geometry')
start_coords = [ gdf.centroid[0].y, gdf.centroid[0].x]
gdf_json = gdf.to_json()
map = folium.Map(start_coords, zoom_start=4)
folium.GeoJson(gdf_json, name='mypolygons').add_to(map)

Create a weighted adjacency list from an alphanumeric edgelist in Python

I've been working on this dataset of protein-protein interactions. I have the edgelist in the following format:
AIG676464 AIG8475985 0.00035. Protein 1, Protein 2, weight.
I've tried several methods and can't get it to output the matrix. What I am hoping to get is the matrix form of the interactions. Any help would be greatly appreciated. Python or R is fine.
I've tried networkx:
import networkx as nx
fh = open("InWeb29.txt", 'rb')
#d = fh.write(textline)
#fh.close()
G = nx.read_edgelist(fh)
G = nx.Graph([()])
A = nx.adjacency_matrix(G)
print(A.todense())
A.setdiag(A.diagonal()*2)
print(A.todense())
Here is my other code so far:
import csv
import pandas as pd
"Load in data file"
"""Read in the data file"""
df = pd.read_csv("datafile.txt", sep= '\t', header=0)
headers = list(df)
prot1 = df[df.columns[0]]
prot2 = df[df.columns[1]]
weight = df[df.columns[2]]
print prot1
with open("datafile.txt") as f:
next(f)
data = [tuple(map(str,row)) for row in csv.reader(f)]
n = max(max(prot1, prot2) for prot1, prot2, weight in data)
matrix = [[None]* n for i in range(n)]
for prot1, prot2 in data:
matrix[prot1][prot2]= weight
for row in matrix:
print(row)

It NetworkX you can use read_weighted_edgelist
import networkx as nx
import StringIO
s = StringIO.StringIO("AIG676464 AIG8475985 0.00035")
G = nx.read_weighted_edgelist(s)
A = nx.adjacency_matrix(G)
print A.todense()
Output
[[ 0. 0.00035]
[ 0.00035 0. ]]

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Python: k-means clustering - python

Related

Clustering near Lines using coordinates in Python

Reverse Array in a dataframe

DBSCAN Clustering loop

Convert DataFrame to a multi polygon DataFrame, multiple data point - python

Create a weighted adjacency list from an alphanumeric edgelist in Python

Categories

Resources