I need a function to delete links from the oldText column (more than 1000 rows) of a pandas DataFrame.
I've created it using regex, but it doesn't work. This is my code:
import re

def remove_links(text):
    text = re.sub(r'http\S+', '', text)
    text = text.strip('[link]')
    return text

df['newText'] = df['oldText'].apply(remove_links)
I get no error; the code just does nothing.
Your code is working for me:
CSV:
oldText
https://abc.xy/oldText asd
https://abc.xy/oldTe asd
https://abc.xy/oldT
https://abc.xy/old
https://abc.xy/ol
Code:
import pandas as pd
import re
def remove_links(text):
    text = re.sub(r'http\S+', '', text)
    text = text.strip('[link]')
    return text
df = pd.read_csv('test2.csv')
df['newText'] = df['oldText'].apply(remove_links)
print(df)
Result:
oldText newText
0 https://abc.xy/oldText asd asd
1 https://abc.xy/oldTe asd asd
2 https://abc.xy/oldT
3 https://abc.xy/old
4 https://abc.xy/ol
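One caveat worth noting: str.strip('[link]') does not remove the literal substring "[link]"; it strips any leading or trailing run of the characters '[', 'l', 'i', 'n', 'k', ']'. A small REPL illustration (my own example, not from the question's data):

>>> 'link to site'.strip('[link]')
' to site'

If the goal is to drop a literal "[link]" marker anywhere in the text, text.replace('[link]', '') is the safer call.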
I have a dataset in the following format
dt =

text      author     title
-------------------------------------
text0     author0    title0
text1     author1    title1
...       ...        ...
and I would like to create separate datasets, each containing only the texts of one author. For example, the dataset named dt1 contains the texts of author1, dt2 contains the texts of author2, etc.
I would be grateful if you could help me with this using Python.
Update:
dt =

   text                             author  title
   ----------------------------------------------------
0  I would like to go to the beach  George  Beach
1  I was in park few days ago       Nick    Park
2  I would like to go in uni        Peter   University
3  I have be in the airport at 8    Maria   Airport
Please try this; it is what I understand you require.
import pandas as pd

data = {
    'text': ['text0', 'text1', 'text2'],
    'author': ['author0', 'author1', 'author1'],
    'title': ['Comunicación', 'Administración', 'Ventas']
}
df = pd.DataFrame(data)
df1 = df[df["author"] == "author0"]
df2 = df[df["author"] == "author1"]
print(df1)
print(df2)
Update:
import pandas as pd

data = {
    'text': ['text0', 'text1', 'text2'],
    'author': ['author0', 'author1', 'author1'],
    'title': ['Comunicación', 'Administración', 'Ventas']
}
df = pd.DataFrame(data)
df1 = df[df["author"] == "author0"]
df2 = df[df["author"] == "author1"]
list_author = df['author'].unique().tolist()
for x in list_author:
    a = df[df["author"] == x]
    print(a)
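If you need one sub-DataFrame per author without hard-coding each value, a dict built from groupby is a common idiom. A minimal sketch (the frames dict is my own naming, not from the question):

frames = {author: group for author, group in df.groupby('author')}
print(frames['author1'])  # only author1's rows

This avoids creating df1, df2, ... by hand; each value in frames is a separate DataFrame you can save or process on its own.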
I'm trying to remove special characters with the re.sub() function, but when I use re.sub() my replace() calls stop working.
My code:
import re
import pandas as pd
from IPython.display import display

tabela = pd.read_excel("tst.xlsx")
(tabela[['nome', 'mensagem', 'arquivo', 'telefone']])
for linha in tabela.index:
    nome = tabela.loc[linha, "nome"]
    mensagem = tabela.loc[linha, "mensagem"]
    acordo = tabela.loc[linha, "acordo"]
    telefone = tabela.loc[linha, "telefone"]
    texto = mensagem.replace("fulano", nome)
    texto = texto.replace("value", acordo)
    texto = texto.replace("phone", telefone)
    texto = re.sub(r"[!!##$%¨&*()_?',;.]", '', telefone)
    print(texto)
Print result:
11
How it should come out:
thyago
R$200
11
Try this: change

texto = re.sub(r"[!!##$%¨&*()_?',;.]", '', telefone)

to

texto = re.sub(r"[!!##$%¨&*()_?',;.]", '', texto)

The original line overwrites texto with the cleaned phone number alone, discarding the earlier replace() results; applying re.sub to texto keeps them.
Loops should rarely be needed in pandas.
Try this:
tabela['telefone'] = tabela['telefone'].str.replace(r'[!!##$%¨&*()_?\',;.]', '', regex=True)
tabela['mensagem'] = tabela.apply(lambda x: x['mensagem'].replace('fulano', str(x['nome'])), axis=1)
tabela['mensagem'] = tabela.apply(lambda x: x['mensagem'].replace('value', str(x['acordo'])), axis=1)
tabela['mensagem'] = tabela.apply(lambda x: x['mensagem'].replace('phone', str(x['telefone'])), axis=1)
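Note that .str.replace is fully vectorized, while apply(..., axis=1) still runs a Python-level function per row; both read better than an explicit index loop, and for a few thousand rows the performance difference is negligible.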
I have this code:
import xml.etree.ElementTree as ET
from xml.dom import minidom

def convert_df_to_xml(df, fd, ld):
    # create the root element, named Invoices
    root = ET.Element("Invoices")
    root.set("from", str(fd))
    root.set("till", str(ld))
    for i in range(len(df['partner_id'])):
        # add a sub-element
        invoices = ET.SubElement(root, "Invoice")
        invoices.set('clientid', df['company_registry'][i])
        invoices.set('imones_pavadinimas', df['partner_id'][i])
        # add a sub-sub-element
        quebec = ET.SubElement(invoices, "Product")
        # load the row info from the DataFrame
        sectin_1 = ET.SubElement(quebec, "Name")
        sectin_1.text = str(df["Name"][i])
        sectin_2 = ET.SubElement(quebec, 'Quantity')
        sectin_2.text = str(df["time_dif"][i])
        sectin_3 = ET.SubElement(quebec, 'Price')
        sectin_3.text = str(df["price_unit"][i])
    xmlstr = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ", encoding="UTF-8").decode("UTF-8")
    with open("bandomasis_itp_xml_failas_V_1.1.xml", "w") as f:
        f.write(xmlstr)
I'm creating an XML file from a pandas DataFrame. The problem is that in the XML file I get "?" marks instead of the "ė" character.
The DataFrame contains strings with the characters "ė, ą, š, ų", and I need them preserved in the XML file.
My dataframe:
df1 = pd.DataFrame({'partner_id': ['MED GRUPĖ, UAB'], 'Name': ['Pirmas'],
                    'company_registry': ['3432543'],
                    'time_dif': ['2'], 'price_unit': ['23']})
What is the problem with the encoding here?
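A likely cause: open("...", "w") without an encoding argument uses the platform's default locale encoding (often cp1252 on Windows), which cannot represent ė, ą, š, ų, while the XML declaration produced by toprettyxml(encoding="UTF-8") still claims UTF-8. A minimal sketch of the fix, assuming the rest of the function stays unchanged, is to make the file encoding explicit:

xmlstr = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ", encoding="UTF-8").decode("UTF-8")
# write the file as UTF-8 so it matches the encoding declared in the XML header
with open("bandomasis_itp_xml_failas_V_1.1.xml", "w", encoding="utf-8") as f:
    f.write(xmlstr)

Alternatively, skip the .decode(...) call and open the file in binary mode ("wb"), writing the UTF-8 bytes that toprettyxml already returns.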
I have a txt file like this:
name lastname 17 189cm
How do I get it to be like this?
name lastname, 17, 189cm
Using str.strip and str.split:
>>> my_string = 'name lastname 17 189cm'
>>> s = list(map(str.strip, my_string.split()))
>>> ', '.join([' '.join(s[:2]), *s[2:] ])
'name lastname, 17, 189cm'
You can use regex to replace multiple spaces (or tabs) with a comma:
import re
text = 'name lastname 17 189cm'
re.sub(r'\s\s+|\t', ', ', text)
text = 'name lastname 17 189cm'
out = ', '.join(text.rsplit(maxsplit=2)) # if sep is not provided then any consecutive whitespace is a separator
print(out) # name lastname, 17, 189cm
You could use re.sub:
import re
s = "name lastname 17 189cm"
re.sub("[ ]{2,}",", ", s)
PS: for the first version of the problem you posed, I had the following solution (reversing the string turns "replace the last two spaces" into an ordinary left-to-right replace with a count of 2):
s = "name lastname 17 189cm"
s[::-1].replace(" ", ",", 2)[::-1]
I'm trying to load a CSV file into a pandas DataFrame. The CSV is semicolon-delimited. Values in the text columns are enclosed in double quotation marks.
File in question: https://www.dropbox.com/s/1xv391gebjzmmco/file_01.csv?dl=0
In one of the text columns ('TYTUL') I have the following value:
"00 307 1457 212"
I specify the column as str, but when I print or export the results to Excel I get
003071457212
instead of
00 307 1457 212
How do I prevent pandas from removing spaces?
Here is my code:
import pandas
df = pandas.read_csv(r'file_01.csv'
    ,sep = ';'
    ,quotechar = '"'
    ,names = ['DATA_OPERACJI'
        ,'DATA_KSIEGOWANIA'
        ,'OPIS_OPERACJI'
        ,'TYTUL'
        ,'NADAWCA_ODBIORCA'
        ,'NUMER_KONTA'
        ,'KWOTA'
        ,'SALDO_PO_OPERACJI'
        ,'KOLUMNA_9']
    ,usecols = [0,1,2,3,4,5,6,7]
    ,skiprows = 38
    ,skipfooter = 3
    ,encoding = 'cp1250'
    ,thousands = ' '
    ,decimal = ','
    ,parse_dates = [0,1]
    ,converters = {'OPIS_OPERACJI': str
        ,'TYTUL': str
        ,'NADAWCA_ODBIORCA': str
        ,'NUMER_KONTA': str}
    ,engine = 'python'
    )
df.TYTUL.replace([' +', '^ +', ' +$'], [' ', '', ''],regex=True,inplace=True) #this only removes excessive spaces
print(df.TYTUL)
I also came up with a workaround (see the lines marked #workaround), but I would like to ask if there is a better way.
import pandas
df = pandas.read_csv(r'file_01.csv'
    ,sep = ';'
    ,quotechar = '?' #workaround
    ,names = ['DATA_OPERACJI'
        ,'DATA_KSIEGOWANIA'
        ,'OPIS_OPERACJI'
        ,'TYTUL'
        ,'NADAWCA_ODBIORCA'
        ,'NUMER_KONTA'
        ,'KWOTA'
        ,'SALDO_PO_OPERACJI'
        ,'KOLUMNA_9']
    ,usecols = [0,1,2,3,4,5,6,7]
    ,skiprows = 38
    ,skipfooter = 3
    ,encoding = 'cp1250'
    ,thousands = ' '
    ,decimal = ','
    ,parse_dates = [0,1]
    ,converters = {'OPIS_OPERACJI': str
        ,'TYTUL': str
        ,'NADAWCA_ODBIORCA': str
        ,'NUMER_KONTA': str}
    ,engine = 'python'
    )
df.TYTUL.replace([' +', '^ +', ' +$'], [' ', '', ''],regex=True,inplace=True) #this only removes excessive spaces
df.TYTUL.replace(['^"', '"$'], ['', ''],regex=True,inplace=True) #workaround
print(df.TYTUL)
Remove this line from your read_csv call:

,thousands = ' '

I tested it; the output is correct without that option:

'00 307 1457 212'
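The likely reason, as far as I can tell: with thousands=' ', the parser treats the spaces inside digit-only fields as thousands separators and strips them while tokenizing, before your str converters are applied, so "00 307 1457 212" arrives as 003071457212. If the amount columns really do use a space as the thousands separator, you can drop the global option and parse just those columns after reading. A hedged sketch (the exact number format of KWOTA and SALDO_PO_OPERACJI in the file is an assumption on my part):

for col in ['KWOTA', 'SALDO_PO_OPERACJI']:
    df[col] = (df[col].astype(str)
                      .str.replace(' ', '', regex=False)   # drop space thousands separators
                      .str.replace(',', '.', regex=False)  # decimal comma -> decimal point
                      .astype(float))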