Pandas - Split column entry (each other seperator) - python

I have a pandas data frame that looks something like this
| id | name | latlon |
0 sat -28,14 | -23, 12 | -21, 13...
the latlon column entry contains multiple latitude/longitude entries, seperated with the | symbol, I need to split them into a list as follows: lat = [-28,-23,-21] lon = [14,12,13]
running the following command will create a list of all the values
sat_df["latlon"]= sat_df["latlon"].str.split("|", expand=False)
example:indexnumber [-58.562242560404705,52.82662430990185, -61.300361184039964,64.0645716165538, -62.8683906074927,76.96557954998904, -63.078154849236505,90.49660509514713, -61.95530287454162,103.39930010176977, -59.727998547544765,114.629246065411, -56.63116878989326,124.07501384844198, -52.9408690779807,131.75498199669985, -48.85803704806645,137.9821558270659, -44.56621244973711,143.03546934613863, -40.08092215592037,147.27807367743728, -35.5075351924213,150.86679792543603,]
how can I continue to split the data, so each other entry is assgined to the lat/lon list respectivley, for the entire dataframe. Alternativley, is there some way to create two columns (lat/lon) which both hold a list object with all the values?
EDIT:
import pandas as pd
sat_df = pd.DataFrame({'卫星编号': {0: 38858, 1: 5, 2: 16}, 'path': {0: '-2023240,1636954,-1409847|-2120945,1594435,-1311586|-2213791,1547970,-1209918|', 1: '8847,-974294,-168045|69303,-972089,-207786|129332,-963859,-246237|189050,-949637,-283483|', 2: '283880,751564,538726|214030,782804,550729|142133,808810,558964|69271,829348,563411|'}, 'latlon': {0: '-28.566504816706743,-58.42623323318429|-26.424915546197877,-58.03051668423269|-24.24957760771616,-57.709052434729294|-22.049419348341488,-57.45429550739338|-19.82765114196696,-57.258197633964414|-17.58719794818057,-57.113255687570714|-15.33074070109176,-57.01245109909582|-13.060755383916138,-56.949188922655416|-10.779548173615462,-56.91723753411087|-8.48928513939462,-56.910669632641685|-6.192021225701933,-56.92380598464241|-3.8897270110140494,-56.951159278680606|-1.5843114029280712,-56.987381318629815|0.7223533959819478,-57.02721062232328|3.028411197431552,-57.06542107180802|5.331999106238248,-57.09677071391785|7.631224662503422,-57.115951252231326|9.924144733525859,-57.11753523668981|12.20873984934678,-57.09592379302077|14.482890506579363,-57.045292032888945|16.744349099342163,-56.95953284633186|18.99070929829218,-56.83219872719919|', 1: '-9.826016080133869,71.12640824438319|-12.077961267269185,74.17040194928683|-14.251942328865088,77.22102880126546|-16.362232784638383,80.31943171515469|-18.372371674164317,83.43158582640798|-20.311489634835258,86.62273098947678|-22.14461262803909,89.85609377674561|-23.896490600856566,93.19765633031801|-25.53339979617313,96.60696767976263|-27.063070616439813,100.12254137641649|-28.488648081761962,103.78528610926675|-29.778331008010497,107.54645547637602|-30.942622037767002,111.47495996053523|-31.95152016226762,115.51397654947516|-32.80866797590735,119.73211812295206|-33.486858278098815,124.06227007574186|-33.98257678066123,128.57116785317814|-34.27304876808886,133.17990028392123|-34.34804732039687,137.91355482600457|-34.19053759979979,142.79776551711302|-33.788689805715364,147.73758823197466|-33.12248489727676,152.7937677542324|', 2: '34.00069374375586,-130.03583418452314|34.3070000099521,-125.16691893340256|34.37547230320849,-120.37930544344802|34.219644836708575,-115.72548686095767|33.8599777210809,-111.25048787484094|33.307236654159695,-106.89130089454063|32.579218893589676,-102.68672977394559|31.69071108398145,-98.63657044455137|30.663892680279847,-94.76720076317056|29.49498481622457,-91.01231662520239|28.20247456939903,-87.39472628213446|26.796048279088225,-83.90476041381801|25.29620394685256,-80.5572008057606|23.686627724590036,-77.28791855670698|21.984668849769005,-74.1108962902788|20.209508481020038,-71.0367205896831|18.337433788359615,-68.00383542959851|16.385207987194672,-65.02251732177939|14.355346635752394,-62.078279068092414|12.266387624465171,-59.17870114389838|10.087160866120724,-56.262880710180255|7.8348695447113235,-53.336971029542006|'}})
#splits latlon data into a list
sat_df.dropna(inplace=True)
sat_df["latlon"]= sat_df["latlon"].str.split("|", expand=False)
sat_df
#need to write each entries latlon list as two lists (alternating lat and lon)
lat = []
lon = []
#for sat_df["latlon"]:

lets go a step back from your str.strip and make use of explode which was added in pandas 0.25
then merge it back based on the index.
df = sat_df['latlon'].str.split('|').explode().str.split(',',expand=True)
new_df = pd.merge(sat_df.drop('latlon',axis=1),
df,left_index=True,
right_index=True).rename(columns={0 : 'Lat', 1 : 'Lon'})
print(new_df.drop('path',axis=1))
卫星编号 Lat Lon
0 38858 -28.566504816706743 -58.42623323318429
0 38858 -26.424915546197877 -58.03051668423269
0 38858 -24.24957760771616 -57.709052434729294
0 38858 -22.049419348341488 -57.45429550739338
0 38858 -19.82765114196696 -57.258197633964414
.. ... ... ...
2 16 14.355346635752394 -62.078279068092414
2 16 12.266387624465171 -59.17870114389838
2 16 10.087160866120724 -56.262880710180255
2 16 7.8348695447113235 -53.336971029542006
2 16 None

For this purpose we are using pandas library.
Initially I have created a dataframe as you have mentioned.
Code:
import pandas as pd
latlon = [-58.562242560404705,52.82662430990185, -61.300361184039964,64.0645716165538, -62.8683906074927,76.96557954998904, -63.078154849236505,90.49660509514713, -61.95530287454162,103.39930010176977, -59.727998547544765,114.629246065411, -56.63116878989326,124.07501384844198, -52.9408690779807,131.75498199669985, -48.85803704806645,137.9821558270659, -44.56621244973711,143.03546934613863, -40.08092215592037,147.27807367743728, -35.5075351924213,150.86679792543603,]
# print(latlon)
data = pd.DataFrame({'id':[0],'name':['sat'],'latlon':[latlon]})
print(data)
Output:
id name latlon
0 0 sat [-58.562242560404705, 52.82662430990185, -61.3...
Now I've converted the latlon to string in order to iterate because if you try to iterate float value you may get error. Then we are passing the lattitude and longitude values to corresponding columns of the dataframe.
This code will work even if you have more any number of records or rows in your dataframe.
Code:
#splittint latlon and making adding the values to lat and lon columns
lats = []
lons = []
for i in range(len(data)):
lat_lon = [str(x) for x in (data['latlon'].tolist()[i])]
lat = []
lon = []
for i in range(len(lat_lon)):
if i%2==0:
lat.append(float(lat_lon[i]))
else:
lon.append(float(lat_lon[i]))
lats.append(lat)
lons.append(lon)
data = data.drop('latlon',axis=1) #dropping latlon column
data.insert(2,'lat',lats) #adding lat column
data.insert(3,'lon',lons) #adding lon column
# print(data)
data #displaying dataframe
Output:
id name lat lon
0 0 sat [-58.562242560404705, -61.300361184039964, -62... [52.82662430990185, 64.0645716165538, 76.96557...
I hope it would be helpful.

Related

Getting data into a map

I got my .dat data formatted into arrays I could use in graphs and whatnot.
I got my data from this website and it requires an account if you want to download it yourself. The data will still be provided below, however.
https://daac.ornl.gov/cgi-bin/dsviewer.pl?ds_id=1028
data in python:
import pandas as pd
df = pd.read_csv("ocean_flux_co2_2d.dat", header=None)
print(df.head())
0 1 2 3
0 -178.75 -77.0 0.000003 32128.7
1 -176.25 -77.0 0.000599 32128.7
2 -173.75 -77.0 0.001649 39113.5
3 -171.25 -77.0 0.003838 58934.0
4 -168.75 -77.0 0.007192 179959.0
I then decided to put this data into arrays that could be put into graphs and other functions.
Like so:
lat = []
lon = []
sed = []
area = []
with open('/home/srowpie/SrowFinProj/Datas/ocean_flux_tss_2d.dat') as f:
for line in f:
parts = line.split(',')
lat.append(float(parts[0]))
lon.append(float(parts[1]))
sed.append(float(parts[2]))
area.append(float(parts[3]))
lat = np.array(lat)
lon = np.array(lon)
sed = np.array(sed)
area = np.array(area)
My question now is how can I put this data into a map with data points? Column 1 is latitude, Column 2 is longitude, Column 3 is sediment flux, and Column 4 is the area covered. Or do I have to bootleg it by making a graph that takes into account the variables lat, lon, and sed?
You don't need to get the data into an array. Just apply df.values and you would have a numpy array of all the data in the dataframe.
Example -
array([[-1.78750e+02, -7.70000e+01, 3.00000e-06, 3.21287e+04],
[-1.76250e+02, -7.70000e+01, 5.99000e-04, 3.21287e+04],
[-1.73750e+02, -7.70000e+01, 1.64900e-03, 3.91135e+04],
[-1.71250e+02, -7.70000e+01, 3.83800e-03, 5.89340e+04],
[-1.68750e+02, -7.70000e+01, 7.19200e-03, 1.79959e+05]])
I'll not recommend storing individual columns as variable. Instead just set the column names for the dataframe and then use them to extract a pandas Series of the data in that column.
df.columns = ["Latitude", "Longitude", "Sediment Flux", "Area covered"]
This what the table would look like after this,
Latitude
Longitude
Sediment Flux
Area covered
0
-178.75
-77.0
3e-06
32128.7
1
-176.25
-77.0
0.000599
32128.7
2
-173.75
-77.0
0.001649
39113.5
3
-171.25
-77.0
0.003838
58934.0
4
-168.75
-77.0
0.007192
179959.0
Simply do df[column_name] to get the data in that column.
For example -> df["Latitude"]
Output -
0 -178.75
1 -176.25
2 -173.75
3 -171.25
4 -168.75
Name: Latitude, dtype: float64
Once you have done all this, you can use folium to plot the rows on real interactive maps.
import folium as fl
map = fl.Map(df.iloc[0, :2], zoom_start = 100)
for index in df.index:
row = df.loc[index, :]
fl.Marker(row[:2].values, f"{dict(row[2:])}").add_to(map)
map

How do I write a for loop within a function to pickup values within a csv?

I have a file called sampleweather100 which has Latitudes and longtidudes of addresses. If i manually type in these lats and longs under the location list function, I get the output I desire. However, I want to write a function where it pulls out the output for all rows of my csv without me manually entering it:
import pandas as pd
my_cities = pd.read_csv('sampleweather100.csv')
from wwo_hist import retrieve_hist_data
#lat = -31.967819
#lng = 115.87718
#location_list = ["-31.967819,115.87718"]
frequency=24
start_date = '11-JAN-2018'
end_date = '11-JAN-2019'
api_key = 'MyKey'
location_list = ["('sampleweather100.csv')['Lat'],('sampleweather100.csv')['Long']"]
hist_weather_data = retrieve_hist_data(api_key,
location_list,
start_date,
end_date,
frequency,
location_label = False,
export_csv = True,
store_df = True)
My function location_list = ["('sampleweather100.csv')['Lat'],('sampleweather100.csv')['Long']"] does not work. Is there a better way or a forloop that will fetch each rows lat and long into that location_list function:
Reprex of dataset:
my_cities
Out[89]:
City Lat Long
0 Lancaster 39.754545 -82.636371
1 Canton 40.851178 -81.470345
2 Edison 40.539561 -74.336307
3 East Walpole 42.160667 -71.213680
4 Dayton 39.270486 -119.577078
5 Fort Wainwright 64.825343 -147.673877
6 Crystal 45.056106 -93.350020
7 Medford 42.338916 -122.839771
8 Spring Valley 41.103816 -74.045399
9 Hillsdale 41.000879 -74.026089
10 Newyork 40.808582 -73.951553
Your way of building the list just does not make sense. You are using the filename of the csv, which is just a string and holds no reference to the file itself or the dataframe you have created from it.
Since you buildt a dataframe called my_cities from your csv using pandas, you need to extract your list of pairs from the dataframe my_cities:
location_list = [','.join([str(lat), str(lon)]) for lat, lon in zip(my_cities['Lat'], my_cities['Long'])]
This is the list you get with the above line using your sample dataframe:
['39.754545,-82.636371', '40.851178000000004,-81.470345',
'40.539561,-74.33630699999999', '42.160667,-71.21368000000001',
'39.270486,-119.577078', '64.825343,-147.673877', '45.056106,-93.35002',
'42.338916,-122.839771', '41.103815999999995,-74.045399',
'41.000879,-74.026089', '40.808582,-73.951553']
You could use one of these to covert the dataframe into a list of comma-separated pairs:
location_list = [
'{},{}'.format(lat, lon)
for i, (lat, lon) in my_cities.iterrows()
]
or
location_list = [
'{},{}'.format(lat, lon)
for lat, lon in my_cities.values
]

Matching names between two columns of two dataframes and adding new columns to one - long computing time

I have two dataframes:
df1 -> Dataframe of all german cities their names and more data.
df2 -> Dataframe of all german cities and their longitude and latitude
I wrote a function that searches for a city name in both dataframes and returns the longitude and latitude:
def ret_longlat(city_name):
if sum(df_cities["city"] == city_name) > 0:
long = df_cities["lon"][df_cities["city"] == city_name].iloc[0]
lat = df_cities["lat"][df_cities["city"] == city_name].iloc[0]
else:
long = 0
lat = 0
return long,lat
In the next step I apply this function to all city names of df1 and save the result in a new Column:
df_result["long"] = df_result["city_names"].apply(lambda x: ret_longlat(x)[0])
df_result["lat"] = df_result["city_names"].apply(lambda x: ret_longlat(x)[1])
This whole process takes relatively long (I'd say 5 minutes for 12162 rows).
Is there a way to improve the code?
Example Data:
df1
city
1 stadtA
2 stadtB
3 stadtu
4 stadty
5 stadtX
df2
city lat lon
14 stadtD 50.611879 12.135526
24 stadtA 48.698890 9.842890
25 stadtC 52.947222 12.849444
26 stadtB 52.867370 12.813750
27 stadtY 52.985000 12.854444
This is a merge problem. You can perform a left merge and then fill missing values:
res = pd.merge(df1.rename(columns={'city_names': 'city'}),
df2[['city', 'long', 'lat']].drop_duplicates('city'),
how='left', on='city')
res[['long', 'lat']] = res[['long', 'lat']].fillna(0)

Accessing Binned Data with pandas

I have a set of data for which i have put into a data frame and then binned:
print(data1)
[[-1.90658883e+00 5.66881290e-01 1.45443907e+00]
[-1.82926850e+00 2.53325112e-01 1.45480072e+00]
[-1.59073925e+00 5.33264011e-01 1.45461954e+00]
...
[ 2.86246982e+02 4.52961148e-01 6.19121328e+00]]
df = pd.DataFrame(data=data1,)
print(df)
bins = [0,50,100,150,200,250,300,400]
df1 = pd.cut(df[0],bins, labels = False)
print(df1)
1 0
2 0
..
500 4
501 4
502 5
0 through 5 are the bin labels. I want to be able to access the data in each bin/category and store it in a variable. Something like this:
x = df1(4) # this doesnt work, just an example.
^ meaning I want to access the data stored in the 4th bin in the pandas dataframe and assign it to the variable x as an array, but I am unsure how to do that.
You can use pandas.DataFrame.loc and pass a boolean array to it.
bi = pd.cut(df[0], bins, labels=False)
x = df.loc[bi == 4]

Convert lat/long coordinates in a pandas Series to list of lists

I have a column in pandas called 'coords'. It has multiple comma delimited longitude + 'space' + latitude values in each row.
A sample row for the 'coords' column would appear like below...
[-88.12166374975578 42.13019789209025, -88.12166297898594 42.130077282796826, -88.12166229779616 42.12997073740438, -88.12165682902426 42.129114208546525, -88.12165440666122 42.12867029753218]
I would like to create a list of lists from the list. So that it would appear like this...
[[-88.12166374975578, 42.13019789209025], [-88.12166297898594 ,42.130077282796826], [-88.12166229779616, 42.12997073740438], [-88.12165682902426,42.129114208546525], [-88.12165440666122, 42.12867029753218]]
How can I convert df['coords'] to the list of lists?
Here is a head()...
coords
0 -88.12166374975578 42.13019789209025, -88.12166297898594 42.130077282796826, -88.12166229779616 42.12997073740438, -88.12165682902426 42.129114208546525, -88.12165440666122 42.12867029753218, -88.12165409167278 42.12861210461891, -88.12165078955562 42.1280072560737, -88.1216505237599 42.127958648542936, -88.12164976861018 42.127820070569165, -88.12164950156834 42.127770730347784, -88.12164936198349 42.127745113495685, -88.12164631909246 42.12698047923614, -88.12164465148149 42.126561239318384, -88.12164441208937 42.126501380826646, -88.12165535387125 42.125918676152615, -88.12165901489989 42.1257236125411, -88.12166910482216 42.125179681003004, -88.12167046792653 42.12511347549821, -88.12168153859359 42.124574951678966, -88.12169213266428 42.12405994975595, -88.12169609920953 42.123867...
1 -88.15806483536268 42.15423929791892, -88.15734814434225 42.15424023425998, -88.15692561771552 42.15424078182948, -88.15612280604331 42.15424182229812, -88.15570230201315 42.154247060953836, -88.15537304882349 42.15424548051985, -88.15424894139665 42.15424008174756, -88.15312432528388 42.15423466567452, -88.15200516375596 42.15422926640768, -88.15075402101326 42.1542232181898, -88.15074137162432 42.15422315689777, -88.15073738857417 42.15384470168878, -88.1507388608806 42.15329655518857, -88.15074017125366 42.15246856985761, -88.15074053615406 42.15224538180373, -88.15074152744889 42.151633597914206, -88.15074252669456 42.15055197422978, -88.15074334980639 42.15033614385567, -88.15074448165737 42.15003982848825, -88.15074567060333 42.14972749019171, -88.15074611950101 42.14952766024307...
Assuming what you showed is an excerpt of the Coords column, you can use pd.Series.str.split:
coords = df.Coords
print(coords)
0 -88.12166374975578 42.13019789209025
1 -88.12166297898594 42.130077282796826
2 -88.12166229779616 42.12997073740438
3 -88.12165682902426 42.129114208546525
4 -88.12165440666122 42.12867029753218
dtype: object
list_ = coords.str.split(expand=True).applymap(float).values.tolist()
print(list_)
[[-88.12166374975578, 42.13019789209025],
[-88.12166297898594, 42.130077282796826],
[-88.12166229779616, 42.12997073740438],
[-88.12165682902426, 42.129114208546525],
[-88.12165440666122, 42.12867029753218]]
Edited solution:
print(coords)
coords
0 -88.12166374975578 42.13019789209025, -88.1216...
1 -88.15806483536268 42.15423929791892, -88.1573...
out = df.coords.str.split(',\s+').apply(pd.Series).stack()\
.str.split(expand=True).applymap(float).values.tolist()
print(out)
[[-88.12166374975578, 42.13019789209025],
[-88.12166297898594, 42.130077282796826],
[-88.12166229779616, 42.12997073740438],
[-88.12165682902426, 42.129114208546525],
[-88.12165440666122, 42.12867029753218],
[-88.12165409167278, 42.12861210461891],
[-88.12165078955562, 42.1280072560737],
[-88.1216505237599, 42.127958648542936],
[-88.12164976861018, 42.127820070569165],
[-88.12164950156834, 42.127770730347784],
[-88.12164936198349, 42.127745113495685],
[-88.12164631909246, 42.12698047923614],
[-88.12164465148149, 42.126561239318384],
[-88.12164441208937, 42.126501380826646],
[-88.12165535387125, 42.125918676152615],
[-88.12165901489989, 42.1257236125411],
[-88.12166910482216, 42.125179681003004],
[-88.12167046792653, 42.12511347549821],
[-88.12168153859359, 42.124574951678966],
[-88.12169213266428, 42.12405994975595],
[-88.12169609920953, 42.123867],
[-88.15806483536268, 42.15423929791892],
[-88.15734814434225, 42.15424023425998],
[-88.15692561771552, 42.15424078182948],
[-88.15612280604331, 42.15424182229812],
[-88.15570230201315, 42.154247060953836],
[-88.15537304882349, 42.15424548051985],
[-88.15424894139665, 42.15424008174756],
[-88.15312432528388, 42.15423466567452],
[-88.15200516375596, 42.15422926640768],
[-88.15075402101326, 42.1542232181898],
[-88.15074137162432, 42.15422315689777],
[-88.15073738857417, 42.15384470168878],
[-88.1507388608806, 42.15329655518857],
[-88.15074017125366, 42.15246856985761],
[-88.15074053615406, 42.15224538180373],
[-88.15074152744889, 42.151633597914206],
[-88.15074252669456, 42.15055197422978],
[-88.15074334980639, 42.15033614385567],
[-88.15074448165737, 42.15003982848825],
[-88.15074567060333, 42.14972749019171],
[-88.15074611950101, 42.14952766024307]]

Categories