issues in extracting data from a csv - python
class QuotesSpider(scrapy.Spider):
    """Google each company name from input.csv and scrape e-mail addresses.

    Expected input.csv layout (as shown in the question's sample data):
      - column 0: SIRET identifier, emitted alongside each scraped item
      - column 2: company name used to build the Google search query
    NOTE(review): if the file starts with a header row, the first request is
    built from the literal column names -- skip it with next(datareader)
    before building the lists if that is the case.
    """
    name = "googlemailverif"

    # Read the CSV exactly once, at class-definition time.  zip(*) transposes
    # the (siret, url) pairs into two parallel tuples, so sirets[i] is the
    # SIRET belonging to start_urls[i].  This avoids both problems the
    # question raises: re-opening the file and losing the row association.
    with open('input.csv', "r") as csvfile:
        datareader = csv.reader(csvfile)
        sirets, start_urls = zip(
            *[(row[0], 'https://www.google.fr/search?q=email' + str(row[2]))
              for row in datareader])

    def start_requests(self):
        # Carry each row's SIRET in the request meta so parse() can emit it
        # next to the data scraped from the matching response (robust even
        # if Google redirects and response.url no longer equals the request
        # url, which would break an index()-based lookup).
        for siret, url in zip(self.sirets, self.start_urls):
            yield scrapy.Request(url, callback=self.parse,
                                 meta={'siret': siret})

    def parse(self, response):
        """Yield one item per result page: url, name, e-mails found, SIRET."""
        body_text = ''.join(response.xpath("//body//text()").extract()).strip()
        yield {
            'url': response.url,
            'nom': "nom",
            # Fixed: the separator in an e-mail address is '@', not '#' --
            # the original pattern could never match a real address.
            'emails': re.findall(
                r"[a-zA-Z0-9_\.+-]+@[a-zA-Z0-9_\.+-]+\.[a-zA-Z]{2,6}",
                body_text),
            'SIRET': response.meta['siret'],
        }
This code tries to read a CSV file (whose third column contains a company name) and search Google for e-mails related to that company.
The first column of the CSV contains a value I want to include in each scraped item under the key "SIRET".
How can I do it?
If I put it into start_urls while reading the CSV, my URLs will be wrong. If I read the file again inside parse, the value will not match the row that produced the response, and I may get an error from opening the file a second time.
How can I pass the value read from the first column through to the "SIRET" field inside the parse function?
I am struggling for hours on it :(
Best,
We can use zip for this.
sirets, start_urls = zip(*[(row[0], 'https://www.google.fr/search?q=email'+str(row[2])) for row in datareader])
Now you have one list containing the SIRET values and another list containing urls
"SIRET","NIC","L1_NORMALISEE","L2_NORMALISEE","L3_NORMALISEE","L4_NORMALISEE","L5_NORMALISEE","L6_NORMALISEE","L7_NORMALISEE","L1_DECLAREE","L2_DECLAREE","L3_DECLAREE","L4_DECLAREE","L5_DECLAREE","L6_DECLAREE","L7_DECLAREE","NUMVOIE","INDREP","TYPVOIE","LIBVOIE","CODPOS","CEDEX","RPET","LIBREG","DEPET","ARRONET","CTONET","COMET","LIBCOM","DU","TU","UU","EPCI","TCD","ZEMET","SIEGE","ENSEIGNE","IND_PUBLIPO","DIFFCOM","AMINTRET","NATETAB","LIBNATETAB","APET700","LIBAPET","DAPET","TEFET","LIBTEFET","EFETCENT","DEFET","ORIGINE","DCRET","DATE_DEB_ETAT_ADM_ET","ACTIVNAT","LIEUACT","ACTISURF","SAISONAT","MODET","PRODET","PRODPART","AUXILT","NOMEN_LONG","SIGLE","NOM","PRENOM","CIVILITE","RNA","NICSIEGE","RPEN","DEPCOMEN","ADR_MAIL","NJ","LIBNJ","APEN700","LIBAPEN","DAPEN","APRM","ESSEN","DATEESS","TEFEN","LIBTEFEN","EFENCENT","DEFEN","CATEGORIE","DCREN","AMINTREN","MONOACT","MODEN","PRODEN","ESAANN","TCA","ESAAPEN","ESASEC1N","ESASEC2N","ESASEC3N","ESASEC4N","VMAJ","VMAJ1","VMAJ2","VMAJ3","DATEMAJ"
"005720164","00028","SA SAINTE ISABELLE","","","236 ROUTE D AMIENS","","80100 ABBEVILLE","FRANCE","SA SAINTE-ISABELLE","","","236 RTE D AMIENS","","80100 ABBEVILLE","","236","","RTE","D AMIENS","80100","","32","Nord-Pas-de-Calais-Picardie","80","1","98","001","ABBEVILLE","80","4","01","248000556","41","2209","1","","1","O","201209","","","8610Z","Activités hospitalières","2008","22","100 à 199 salariés","100","2015","1","19830928","19830928","NR","99","","P","S","O","","0","SA SAINTE-ISABELLE","","","","","","00028","32","80001","","5599","SA à conseil d'administration (s.a.i.)","8610Z","Activités hospitalières","2008","","","","22","100 à 199 salariés","100","2015","ETI","19570101","201209","1","S","O","","","","","","","","","","","","2014-07-30T00:00:00"
"005720784","00031","ETABLISSEMENTS DECAYEUX","","","ZONE INDUSTRIELLE","","80210 FEUQUIERES EN VIMEU","FRANCE","ETABLISSEMENTS DECAYEUX","","","ZONE INDUSTRIELLE","","80210 FEUQUIERES EN VIMEU","","","","","ZONE INDUSTRIELLE","80210","","32","Nord-Pas-de-Calais-Picardie","80","1","17","308","FEUQUIERES EN VIMEU","80","1","18","248000630","15","0055","0","","1","O","201209","","","2572Z","Fabrication de serrures et de ferrures","2008","22","100 à 199 salariés","100","2015","4","19930401","19930401","NR","99","","P","S","O","","0","ETABLISSEMENTS DECAYEUX","","","","","","00015","32","80308","","5710","SAS/// société par actions simplifiée","2599A","Fabrication d'articles métalliques ménagers","2008","","N","20160915","32","250 à 499 salariés","200","2015","ETI","19570101","201209","3","S","O","2012","6","2599A","2599A","2599B","2572Z","4649Z","","","","","2001-12-13T00:00:00"
This is an extract from the csv
Every time, the sirets value is the literal header string "SIRET" (the header row is being read as data), while the other variable increments and changes on each item.
Thank you so much ++
Related
export of scraping data to csv
New in python, I scraped a site to get data like season, teams and position. I want to save the data in a CSV. The problem is that the data all get listed on one line. I would like to have a result like this: Below my code: import pdb import re import os import json import pandas as pd '''Structurer les données dans un taleur''' Base=[] Saison=[] Position=[] Equipe=[] for ele in os.listdir('Data/Saison'): with open('Data/Saison/'+ele,'r',encoding='utf8') as output: contenu = output.read() saison=ele.replace('.html','') Saison.append(saison) pattern='<td class="left first__left strong">(.{1,8})</td>' position=re.findall(pattern,contenu) Position.append(position) pattern='<a class="list-team-entry" href="/fr/basketball/equipe/(.{1,4})' id_equipe=re.findall(pattern,contenu) pattern='<a class="list-team-entry" href="/fr/basketball/equipe/(.{1,30})" title="(.{1,20})">' ens=re.findall(pattern,contenu) for e in ens: equipe=e[1] Equipe.append(equipe) pattern='<td class="left highlight">(.{1,8})</td>' pourcent_victoire=re.findall(pattern,contenu) Base.append([Saison,Position,Equipe]) '''On enregistre en CSV''' df=pd.DataFrame(Base, columns = ['saison','position','equipe']) df.to_csv('DataFinal/Base.csv',sep='|',encoding='utf8',index=False) ` Thanks for you help
Web scraping returning empty dictionary
I'm trying to scrape all the data from this website https://ricetta.it/ricette-secondi using Python-Selenium. I'd like to put them into a dictionary, as seen from the code below. However, this is just returning an empty list back. import pprint detail_recipes = [] for recipe in list_recipes: title = "" description = "" ingredient = "" if(len(recipe.find_elements_by_css_selector(".post-title")) > 0): title = recipe.find_elements_by_css_selector(".post-title")[0].text if(len(recipe.find_elements_by_css_selector(".post-excerpt")) > 0): description = recipe.find_elements_by_css_selector(".post-excerpt")[0].text if(len(recipe.find_elements_by_css_selector(".nm-ingr")) > 0): ingredient = recipe.find_elements_by_css_selector(".nm-ingr")[0].text detail_recipes.append({'title': title, 'description': description, 'ingredient': ingredient }) len(detail_recipes) pprint.pprint(detail_recipes[0:10])
You can try this: import requests import numpy as np from bs4 import BeautifulSoup as bs import pandas as pd url="https://ricetta.it/ricette-secondi" page=requests.get(url) soup=bs(page.content,'lxml') df={'title': [],'description': [],'ingredient':[]} for div in soup.find_all("div",class_="post-bordered"): df["title"].append(div.find(class_="post-title").text) try: df["description"].append(div.find(class_="post-excerpt").text) except: df["description"].append(np.nan) i=div.find_all(class_="nm-ingr") if len(i)>0: df["ingredient"].append([j.text for j in i]) else: df["ingredient"].append(np.nan) df=pd.DataFrame(df) df.dropna(axis=0,inplace=True) print(df) Output: title ... ingredient 0 Polpette di pane e formaggio ... [uovo, pane, pangrattato, parmigiano, latte, s... 1 Torta 7 vasetti alle melanzane ... [uovo, olio, latte, yogurt, farina 00, fecola ... 2 Torta a sole con zucchine e speck ... [pasta sfoglia, zucchina, ricotta, uovo, speck... 3 Pesto di limoni ... [limone, pinoli, parmigiano, basilico, prezzem... 4 Bombe di patate ... [patata, farina 00, uovo, parmigiano, sale e p... 5 Polpettone di zucchine ... [zucchina, uovo, parmigiano, pangrattato, pros... 6 Insalata di pollo ... [petto di pollo, zucchina, pomodorino, insalat... 7 Club sandwich ... [pane, petto di pollo, pomodoro, lattuga, maio... 8 Crostata di verdure ... [farina 00, burro, acqua, sale, zucchina, pomo... 9 Pesto di barbabietola ... [barbabietola, parmigiano, pinoli, olio, sale,... [10 rows x 3 columns] I don't know if you use these library or not, but that website doesn't uses javascript to load data, so we can scrape that website using requests and bs4. Most of the people prefer to use these library, if website doesn't uses javascript to load data. It is easy and faster then selenium. And for showing/displaying data I am using pandas with is also preferable library for working on table like data. 
It exactly print data in table like structure and you can save that scraped data in csv, excel file also. If you want to scrape all of the data from next page also then try this: df={'title': [],'description': [],'ingredient':[]} for i in range(0,108): url=f"https://ricetta.it/ricette-secondi?page={i}" page=requests.get(url) soup=bs(page.content,'lxml') for div in soup.find_all("div",class_="post-bordered"): df["title"].append(div.find(class_="post-title").text) try: df["description"].append(div.find(class_="post-excerpt").text) except: df["description"].append(np.nan) i=div.find_all(class_="nm-ingr") if len(i)>0: df["ingredient"].append([j.text for j in i]) else: df["ingredient"].append(np.nan) It will scrape all of the 107 pages of data from that website. You can save this df to csv or excel file by using : df.to_csv("<filename.csv>") # or for excel: df.to_excel("<filename.xlsx>") Edit : As you ask you want to scrape, link of all recipes, I have figure out two things, first just replace space of titles by - and that is the link for that recipe and another is scrape link from there, for that you can use this piece of code: div.find(class_="post-title")["href"] It will return the link of that recipe. And for another approach you can do this: df["links"]=df["title"].apply(lambda x: "https://ricetta.it/"+x.replace(" ","-").lower()) #.lower() is just to not make like a random text but it you remove it also it works. But I personally suggest you just to scrape link from website cuz while making link own our own we may made mistakes.
Webscraping with BeautifulSoup in Python tags
I am currently trying to scrape some information from the following link: http://www2.congreso.gob.pe/Sicr/TraDocEstProc/CLProLey2001.nsf/ee3e4953228bd84705256dcd008385e7/4ec9c3be3fc593e2052571c40071de75?OpenDocument I would like to scrape some of the information in the table using BeautifulSoup in Python. Ideally I would like to scrape the "Groupo Parliamentario," "Titulo," "Sumilla," and "Autores" from the table as separate items. So far I've developed the following code using BeautifulSoup: from bs4 import BeautifulSoup import requests import pandas as pd url = 'http://www2.congreso.gob.pe/Sicr/TraDocEstProc/CLProLey2001.nsf/ee3e4953228bd84705256dcd008385e7/4ec9c3be3fc593e2052571c40071de75?OpenDocument' page = requests.get(url) soup = BeautifulSoup(page.text, 'html.parser') table = soup.find('table', {'bordercolor' : '#6583A0'}) contents = [] summary = [] authors = [] contents.append(table.findAll('font')) authors.append(table.findAll('a')) What I'm struggling with is that the code to scrape the authors only scrapes the first author in the list. Ideally I need to scrape all of the authors in the list. This seems odd to me because looking at the html code for the webpage, all authors in the list are indicated with '<a href = >' tags. I would think table.findAll('a')) would grab all of the authors in the list then. Finally, I'm sort of just dumping the rest of the very messy html (title, summary, parliamentary group) all into one long string under contents. I'm not sure if I'm missing something, I'm sort of new to html and webscraping, but would there be a way to pull these items out and store them individually (ie: storing just the title in an object, just the summary in an object, etc). I'm having a tough time identifying unique tags to do this in the code for the web page. Or is this something I should just clean and parse after scraping?
to get the authors you can use: soup.find('input', {'name': 'NomCongre'})['value'] output: 'Santa María Calderón Luis,Alva Castro Luis,Armas Vela Carlos,Cabanillas Bustamante Mercedes,Carrasco Távara José,De la Mata Fernández Judith,De La Puente Haya Elvira,Del Castillo Gálvez Jorge,Delgado Nuñez Del Arco José,Gasco Bravo Luis,Gonzales Posada Eyzaguirre Luis,León Flores Rosa Marina,Noriega Toledo Víctor,Pastor Valdivieso Aurelio,Peralta Cruz Jonhy,Zumaeta Flores César' to scrape Grupo Parlamentario table.find_all('td', {'width': 446})[1].text output: 'Célula Parlamentaria Aprista' to scrape Título: table.find_all('td', {'width': 446})[2].text output: 'IGV/SELECTIVO:D.L.821/LEY INTERPRETATIVA ' to scrape Sumilla: table.find_all('td', {'width': 446})[3].text output: ' Propone la aprobación de una Ley Interpretativa del Texto Original del Numeral 1 del Apéndice II del Decreto Legislativo N°821,modificatorio del Texto Vigente del Numeral 1 del Apéndice II del Texto Único Ordenado de la Ley del Impuesto General a las Ventas y Selectivo al Consumo,aprobado por Decreto Supremo N°054-99-EF. '
Creating a master list out of a bunch of separate lists in Python
I have scraped text data from a PDF and used split to divide it into separate pages which I append to an empty list called pages. However, this results in pages being a object that just contains a whole bunch of separate list objects for each page. What I want is a master list that contains all of these separate list objects so that when I check for print(type(pages)) I get one list instead of a long printout of individual lists. # creating a pdf File object of original pdf pdfFileObj = open(origFileName, 'rb') # creating a pdf Reader object pdfReader = PyPDF2.PdfFileReader(pdfFileObj) numPages = pdfReader.numPages for p in range(pdfReader.numPages): # creating page object pageObj = pdfReader.getPage(p) #extract txt from pageObj into unicode string object pages = list() for x in (pageObj.extractText()).split('\n'): pages.append(x) print(pages) The result is very long but a sample looks like: ['DESCRIPTIONSULTLetter Type: PLFile Set: SHPPL20190827'] ["Dear A**** P***** ,This letter is to let you know that SING DY UY will no longer provide STAR MRSA services for *******HealthPlan effective . This means that SING DY UY cannot be your provider any more through Superior. You can pick a new provider from the ******* HealthPlan network to get services.Superior will work with you to make sure you get the care you need. There is a provider in your areawho can provide services to you. That provider is JOSE BENIGNO JR. You can call them at1-***-362-2685."] ['Estimado(a) ****** *****,Le enviamos esta carta para informarle que SING DY UY dejar de ofrecer servicios de STAR MRSApara ******* HealthPlan a partir del . Esto significa que SING DY UY ya no podr ser su proveedor atravs de Superior. Para obtener servicios, puede escoger un nuevo proveedor de la red de S****** HealthPlan.Superior trabajar con usted para garantizar que reciba la atencin que necesita. Hay un proveedor ensu rea que puede ofrecerle servicios. Ese proveedor es **** BENIGNO JR. 
Puede comunicarse al 1-432-362-2685.']
Your question wasn't very clear, but what I've done is made a list called pages, and for each page we iterate through, we add the list of lines (called page now) to the list of pages. pdfFileObj = open(origFileName, 'rb') pdfReader = PyPDF2.PdfFileReader(pdfFileObj) numPages = pdfReader.numPages pages = [] for p in range(pdfReader.numPages): pageObj = pdfReader.getPage(p) page = list() for x in pageObj.extractText().split('\n'): page.append(x) pages.append(page)
You'll have to append pages to your master list. Change your for loop to the following, and it should work. master_lst = list() for p in range(pdfReader.numPages): pageObj = pdfReader.getPage(p) pages = list() for x in pageObj.extractText().split('\n'): pages.append(x) master_lst.append(pages) However, this code isn't Pythonic. The inner (nested) for loop is redundant, because pageObj.extractText().split('\n') returns a list of strings, so there's no need to iterate through it. master_lst = list() for p in range(pdfReader.numPages): pages = pdfReader.getPage(p).extractText().split('\n') master_lst.append(pages) Actually, this block can be replaced by a single line of code. Use list comprehension. master_lst = [pdfReader.getPage(p).extractText().split('\n') for p in range(pdfReader.numPages)] The complete code would be as follows. import PyPDF2 pdfFileObj = open(origFileName, 'rb') pdfReader = PyPDF2.PdfFileReader(pdfFileObj) numPages = pdfReader.numPages master_lst = [pdfReader.getPage(p).extractText().split('\n') for p in range(pdfReader.numPages)] If you are unfamiliar with list comprehensions, I suggest reading about it here on StackOverflow.
Parsing webpage that is all text
I'm trying to parse webpage that is a plain text document, it's encoded in HTML so I tried using BeautifulSoup to pull out the text and make a list, but I wasn't able to. <body> <pre> -------------------- BDMEP - INMET -------------------- Estação : PONTA PORA - MS (OMM: 83702) Latitude (graus) : -22.55 Longitude (graus) : -55.71 Altitude (metros): 650.00 Estação Operante Inicio de operação: 24/11/1941 Periodo solicitado dos dados: 01/01/2015 a 17/11/2016 Os dados listados abaixo são os que encontram-se digitados no BDMEP Hora em UTC -------------------- Obs.: Os dados aparecem separados por ; (ponto e vírgula) no formato txt. Para o formato planilha XLS, siga as instruções -------------------- Estacao;Data;Hora;Precipitacao;TempMaxima;TempMinima;Insolacao;Evaporacao Piche;Temp Comp Media;Umidade Relativa Media;Velocidade do Vento Media; 83702;01/01/2015;0000;;;;;;;73.5;3.333333; 83702;06/01/2016;1200;5;;;;;;;; 83702;07/01/2016;0000;;;;;;;76.25;2.40072; 83702;01/02/2016;1200;15.2;;;;;;;; </pre> </body> I'm interested in: Piche;Temp Comp Media;Umidade Relativa Media;Velocidade do Vento Media; 83702;01/01/2015;0000;;;;;;;73.5;3.333333; 83702;06/01/2016;1200;5;;;;;;;; 83702;07/01/2016;0000;;;;;;;76.25;2.40072; 83702;01/02/2016;1200;15.2;;;;;;;; Ideally to construct a DataFrame and save as a CSV. So far I tried stuff like: soup = BeautifulSoup(a.content, 'html.parser') soup = soup.find_all('pre') text = [] for i in soup: print(i) text.append(i) But it has not done the trick. It makes it all one entry in the list.
BS is usefull for HTML tags but you have mostly text so use string functions like split('\n') and slicing [start_row:end_row] your HTML text content = '''<body> <pre> -------------------- BDMEP - INMET -------------------- Estação : PONTA PORA - MS (OMM: 83702) Latitude (graus) : -22.55 Longitude (graus) : -55.71 Altitude (metros): 650.00 Estação Operante Inicio de operação: 24/11/1941 Periodo solicitado dos dados: 01/01/2015 a 17/11/2016 Os dados listados abaixo são os que encontram-se digitados no BDMEP Hora em UTC -------------------- Obs.: Os dados aparecem separados por ; (ponto e vírgula) no formato txt. Para o formato planilha XLS, siga as instruções -------------------- Estacao;Data;Hora;Precipitacao;TempMaxima;TempMinima;Insolacao;Evaporacao Piche;Temp Comp Media;Umidade Relativa Media;Velocidade do Vento Media; 83702;01/01/2015;0000;;;;;;;73.5;3.333333; 83702;06/01/2016;1200;5;;;;;;;; 83702;07/01/2016;0000;;;;;;;76.25;2.40072; 83702;01/02/2016;1200;15.2;;;;;;;; </pre> </body>''' and from bs4 import BeautifulSoup soup = BeautifulSoup(content, 'html.parser') text = soup.find('pre').text lines = text.split('\n') print(lines[-6:-1]) or in one line print(content.split('\n')[-7:-2]) If table has more rows then you can search last ---------------- to find start of table last = content.rfind(' --------------------') lines = content[last:].split('\n') print(lines[1:-2]) And now you can split lines into columns using split(';') to create data for pandas :) Or use io.StringIO to create file-like object in memory and use pd.read_csv() import pandas as pd import io last = content.rfind(' --------------------') lines = content[last:].split('\n')[1:-2] # create one string with table text = '\n'.join(lines) # create file-like object with text fileobject = io.StringIO(text) # use file-like object with read_csv() df = pd.read_csv(fileobject, delimiter=';') print(df) or import pandas as pd import io start = content.rfind(' --------------------') start += len(' 
--------------------') end = content.rfind(' </pre>') text = content[start:end] fileobject = io.StringIO(text) df = pd.read_csv(fileobject, delimiter=';') print(df)
you need re to do this job in: import re re.findall(r'\w+;.+\n', string=html) out: ['Estacao;Data;Hora;Precipitacao;TempMaxima;TempMinima;Insolacao;Evaporacao Piche;Temp Comp Media;Umidade Relativa Media;Velocidade do Vento Media;\n', '83702;01/01/2015;0000;;;;;;;73.5;3.333333;\n', '83702;06/01/2016;1200;5;;;;;;;;\n', '83702;07/01/2016;0000;;;;;;;76.25;2.40072;\n', '83702;01/02/2016;1200;15.2;;;;;;;;\n']