Problem with cookies in beautifulsoup loop - python
I am collecting several pieces of information from a site, societe.com. For a few pages it works fine. Here is a shortened version of my code:
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Company pages on societe.com to scrape.
urlsbourgogne = [
    "https://www.societe.com/societe/AGILIS-COMPTABILITE-902252782.html",
    "https://www.societe.com/societe/ALD-VOLAILLES-877535864.html",
]
# Browser-like User-agent header sent with every request.
headers = {
    "User-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
}
# One [title, url, jugement, typeact] row per scraped page.
data = []
for url in urlsbourgogne:
    soup = BeautifulSoup(
        requests.get(url, headers=headers).content, "html.parser"
    )
    title = soup.select_one("#identite_deno").get_text(strip=True)

    try:
        # Keep the first text fragment of the red cell, cut before "le "
        # (presumably where the judgment date starts -- verify on the page).
        jugement = list(soup.select_one("td.red").stripped_strings)[0].split("le ")[0]
    except (AttributeError, IndexError):
        # AttributeError: no <td class="red"> on the page (select_one -> None).
        # IndexError: the cell exists but contains no text.
        print("Je n'ai pas trouvé de type de jugement pour " + title)
        jugement = "En activité"

    try:
        # Activity description with any trailing " (...)" part removed.
        typeact = soup.select_one("#ape-histo-description").get_text(strip=True).split(" (")[0]
    except AttributeError:
        # The element is missing from the page (select_one returned None).
        print("Je n'ai pas trouvé de type d'activité pour " + title)
        typeact = "None"

    data.append([title, url, jugement, typeact])

# Four column names for the four fields of each row. The comma between
# "Active" and "Type d'activité" is required: without it the two literals are
# concatenated into a single name and pandas rejects the 4-field rows.
df = pd.DataFrame(
    data,
    columns=["Title", "URL", "Active", "Type d'activité"],
)
But my problem is that when I run this loop with a large number of URLs, the site "blocks" me and serves fictitious pages with false information. For example, here are some of the URLs I would like to use (sorry, it's a bit long):
urlsbourgogne = [
"https://www.societe.com/societe/AGILIS-COMPTABILITE-902252782.html",
"https://www.societe.com/societe/ALD-VOLAILLES-877535864.html",
"https://www.societe.com/societe/ALLIANCE-ESCALIER-791602170.html",
"https://www.societe.com/societe/ARCHIPLAN-878552645.html",
"https://www.societe.com/societe/B-TAM-(BETON-TECHNIQUES-AMELIORATIONS-528367741.html",
"https://www.societe.com/societe/B.V.M.-337641898.html",
"https://www.societe.com/societe/B2M-508740941.html",
"https://www.societe.com/societe/BIDEAU-SECTEUR-SANTE-538094814.html",
"https://www.societe.com/societe/BIOAUTOCENTER-838121424.html",
"https://www.societe.com/societe/BMN-OPTIC-893675140.html",
"https://www.societe.com/societe/CG-MULTI-COM-813187788.html",
"https://www.societe.com/societe/CHALON-CONSEIL-EXPERTISE-BOURGOGNE-478873078.html",
"https://www.societe.com/societe/CHATEAU-ELEGANCE-534269956.html",
"https://www.societe.com/societe/D.L.H.-481789147.html",
"https://www.societe.com/societe/DC-COMPANY-831435037.html",
"https://www.societe.com/societe/DEVECI-803963354.html",
"https://www.societe.com/societe/ESPACE-ANIMAL-849236443.html",
"https://www.societe.com/societe/EURL-CPS-71-498295583.html",
"https://www.societe.com/societe/FAINDT-CONSEIL-DEVELOPPEMENT-753631365.html",
"https://www.societe.com/societe/FRANCE-CUST'HOME-840435176.html",
"https://www.societe.com/societe/FRANCE-RENOVATION-ET-CONSTRUCTION-882213440.html",
"https://www.societe.com/societe/INDUSTRIE-BOURGUIGNONNE-DE-LA-CONSTRU-850780875.html",
"https://www.societe.com/societe/INTERPAIN-822877551.html",
"https://www.societe.com/societe/JBK-MULTI-SERVICES-808106132.html",
"https://www.societe.com/societe/JFG-CONDUITE-539108647.html",
"https://www.societe.com/societe/LA-CADOLE-837853548.html",
"https://www.societe.com/societe/LA-TROPICALE-808438295.html",
"https://www.societe.com/societe/LE-CLOS-DES-TOURELLES-489163691.html",
"https://www.societe.com/societe/LES-NECTARS-DE-MARINE-888982972.html",
"https://www.societe.com/societe/LES-PIASTRES-420307175.html",
"https://www.societe.com/societe/LES-TENDANCES-D'ALODIS-901642652.html",
"https://www.societe.com/societe/LHS-ELECTRO-AMEUBLEMENT-384367835.html",
"https://www.societe.com/societe/LT-COM-804735215.html",
"https://www.societe.com/societe/LYON-BOURGOGNE-VTC-844296848.html",
"https://www.societe.com/societe/M.A.D.I.D.---MONTAGE-ASSISTANCE-DEPAN-808481550.html",
"https://www.societe.com/societe/MELIEWASH-887637866.html",
"https://www.societe.com/societe/MSV-751338443.html",
"https://www.societe.com/societe/N.R.J.-HABITAT-414387381.html",
"https://www.societe.com/societe/NIVERNAISE-DE-TRAVAUX-PUBLICS-PHILIPPEAU-950376582.html",
"https://www.societe.com/societe/PALAIS-D'AGADIR-753994086.html",
"https://www.societe.com/societe/PERFORMANCE-ET-TRADITION-490041175.html",
"https://www.societe.com/societe/POTERIE-NORMAND-SN-830796827.html",
"https://www.societe.com/societe/PRH-21-817394265.html",
"https://www.societe.com/societe/PRO-FILTRATION71-822592572.html",
"https://www.societe.com/societe/R-SERVICES-852189679.html",
"https://www.societe.com/societe/REFERENCE-COIFFURE-391911831.html",
"https://www.societe.com/societe/SAM'EVEIL-852758945.html",
"https://www.societe.com/societe/SARL-DAUVERGNE-414090811.html",
"https://www.societe.com/societe/SARL-ECS-890074180.html",
"https://www.societe.com/societe/SARL-MENUISERIE-BOYER-432882934.html",
"https://www.societe.com/societe/SAS-7'IDEAL-844229609.html",
"https://www.societe.com/societe/STBS-821854619.html",
"https://www.societe.com/societe/STOC-ASSAINISSEMENT-341938710.html",
"https://www.societe.com/societe/T.F.B-801699182.html",
"https://www.societe.com/societe/TPC-TRANSPORT-EXPRESS-494102106.html",
"https://www.societe.com/societe/TRUST-BARBER-SHOP-877618199.html",
"https://www.societe.com/societe/VERANDA-DES-AMOGNES-848424537.html",
"https://www.societe.com/societe/VERNIAU-VINCENT-812261618.html",
"https://www.societe.com/societe/A3XX-445387475.html",
"https://www.societe.com/societe/AFFINAGE-D'ALUMINIUM-PREMERY-809693658.html",
"https://www.societe.com/societe/AJC-DISTRIBUTION-449093095.html",
"https://www.societe.com/societe/AMIRO-843832080.html",
"https://www.societe.com/societe/ANALYSE-DIAG-EXPERT-BATIMENT-849818422.html",
"https://www.societe.com/societe/AR'ELEC-498992270.html",
"https://www.societe.com/societe/ASSYSTRONIC-528707680.html",
"https://www.societe.com/societe/ATELIER-AMB-838572253.html",
"https://www.societe.com/societe/ATOL-LES-OPTICIENS-LIMOGES-538695610.html",
"https://www.societe.com/societe/AU-NEUVY-SOIS-850222019.html",
"https://www.societe.com/societe/AUX-DOUCEURS-D'ETANG-837596766.html",
"https://www.societe.com/societe/AUX-RICOCHETS-877773796.html",
"https://www.societe.com/societe/BADRI-&-CONSTRUCTION-853737864.html",
"https://www.societe.com/societe/BEAUNE-COLLECTOR-752092361.html",
"https://www.societe.com/societe/BOURGOGNE-ENERGIE-CHAUFFAGE-CLIMATISA-808185623.html",
"https://www.societe.com/societe/BOUZID-794032078.html",
"https://www.societe.com/societe/BUROPA-SARL-378436257.html",
"https://www.societe.com/societe/CAPUCINES-CO-834616302.html",
"https://www.societe.com/societe/CHOUETTE-BUSINESS-853161768.html",
"https://www.societe.com/societe/CMC-BOURGOGNE-488344474.html",
"https://www.societe.com/societe/COCO-CREATION-888524105.html",
"https://www.societe.com/societe/CONCEPT-BATI-DECO-PLAISANCE-517499778.html",
"https://www.societe.com/societe/CONCEPT-BAUMGARTEN-841374580.html",
"https://www.societe.com/societe/CUMA-ASTER-304850845.html",
"https://www.societe.com/societe/DAVID-BERNARD-530946656.html",
"https://www.societe.com/societe/DIJON-RELAIS-CARTE-GRISE-842384018.html",
"https://www.societe.com/societe/DS-AUTO-VO-825296114.html",
"https://www.societe.com/societe/ECO-CONSTRUCTION-503496176.html",
"https://www.societe.com/societe/EDDY-SAHRAOUI-840429807.html",
"https://www.societe.com/societe/ELLYPSE-889445193.html",
"https://www.societe.com/societe/ESPACE-PARE-BRISE-GLASS-501232813.html",
"https://www.societe.com/societe/EURL-BICLOUNE-PIQUET-887651123.html",
"https://www.societe.com/societe/EURL-PLYMS-841084023.html",
"https://www.societe.com/societe/FIRE-PROTECT-524120870.html",
"https://www.societe.com/societe/FORM-VITALE-433558186.html",
"https://www.societe.com/societe/FRANCK-ESPACES-VERTS-ET-MULTISERVICES-448837476.html",
"https://www.societe.com/societe/GALERIE-PREVOTEAU-DU-CLARY-902974963.html",
"https://www.societe.com/societe/GLS-ELECTRICITE-510933690.html",
"https://www.societe.com/societe/GROUPE-ARTISANAL-DU-BATIMENT-821969805.html",
"https://www.societe.com/societe/HELLO-WORLD-824882427.html",
"https://www.societe.com/societe/HIGHEREDME-814311247.html",
"https://www.societe.com/societe/HOPALE-845025121.html",
"https://www.societe.com/societe/J2MV-518942123.html",
"https://www.societe.com/societe/JULEX-898377254.html",
"https://www.societe.com/societe/KOSTECKI-LANNY-798413340.html",
"https://www.societe.com/societe/L'ATELIER-GOURMAND-841829898.html",
"https://www.societe.com/societe/L'ATELIER-MEDICAL-850846510.html",
"https://www.societe.com/societe/L'EPISODE-853446144.html",
"https://www.societe.com/societe/LA-BICYCLETTE-DE-PAUL-844989574.html",
"https://www.societe.com/societe/LA-BOUTIQUE-DES-SERVICES-752545665.html",
"https://www.societe.com/societe/LA-MAISON-DE-LA-PIZZA-818225807.html",
"https://www.societe.com/societe/LA-MESURE-DES-POSSIBLES-893185454.html",
"https://www.societe.com/societe/LA-PEINTURE-FRANCAISE-822949194.html",
"https://www.societe.com/societe/LA-VIEILLE-TOUR-804460384.html",
"https://www.societe.com/societe/LE-STUDIO-819886979.html",
"https://www.societe.com/societe/LES-PETITS-COLIBRIS-840001085.html",
"https://www.societe.com/societe/LOIRE-MEDICAL-SANTE-499285757.html",
"https://www.societe.com/societe/M-B-CONSEIL-448431429.html",
"https://www.societe.com/societe/MACON-EVENT-840834881.html",
"https://www.societe.com/societe/MH-CLEAN-CONFLUENCE-899227359.html",
"https://www.societe.com/societe/MON-KOCHON-891295545.html",
"https://www.societe.com/societe/NEW-INFORMATIQUE-SYSTEM-SERVICES-839506011.html",
"https://www.societe.com/societe/O.T.O.P-849205950.html",
"https://www.societe.com/societe/ODABAS-792577033.html",
"https://www.societe.com/societe/OXYGENE-LA-DECO-QUI-RESPIRE-887884278.html",
"https://www.societe.com/societe/PALLE-DAMIEN-481522175.html",
"https://www.societe.com/societe/PECHIODAT-331169672.html",
"https://www.societe.com/societe/PIERRE-PAULIN-504978081.html",
"https://www.societe.com/societe/PRO-SANITAIRE-531154441.html",
"https://www.societe.com/societe/PROFESSIONNELS-L-ENTRETIEN-AUTOMOBILES-437668635.html",
"https://www.societe.com/societe/PYRAMIDE-450734595.html",
"https://www.societe.com/societe/SARL-LE-DANI-GOURMAND-520564691.html",
"https://www.societe.com/societe/SAS-ANDRE-BARBOSA-890709231.html",
"https://www.societe.com/societe/SAS-BRASSERIE-DU-GRAND-BOIS-899806780.html",
"https://www.societe.com/societe/SAS-MOREL-753954627.html",
"https://www.societe.com/societe/SAS-NF-CONSTRUCTION-853882967.html",
"https://www.societe.com/societe/SAVEUR-DE-ROME-832030308.html",
"https://www.societe.com/societe/SELARL-RUELLE-WEBER-GAMBIER-534922828.html",
"https://www.societe.com/societe/SENS-515015659.html",
"https://www.societe.com/societe/SYSTEME-V-818436453.html",
"https://www.societe.com/societe/TAOSO-822806659.html",
"https://www.societe.com/societe/TRD-823286968.html",
"https://www.societe.com/societe/VERT-ET-CO-809242506.html",
"https://www.societe.com/societe/VINCENT-DEBUT-887831766.html",
"https://www.societe.com/societe/VOTRE-QUOTIDIEN-FACILE-841654445.html",
"https://www.societe.com/societe/ADOMIE-849316740.html",
"https://www.societe.com/societe/ALEX-CONSTRUCTION-&-RENOVATION-831234893.html",
"https://www.societe.com/societe/APH-794633693.html",
"https://www.societe.com/societe/ARBO-STYL-504103615.html",
"https://www.societe.com/societe/AU-BON-ENDROIT-898543616.html",
"https://www.societe.com/societe/AU-P'TIT-MORVAN-803080506.html",
"https://www.societe.com/societe/BATIMENT-CONSTRUCTION-RENOVATION-SARL-891807596.html",
"https://www.societe.com/societe/BLACK-LION-879760916.html",
"https://www.societe.com/societe/BOUCHERIE-LA-QUEUE-DE-BOEUF-897804951.html",
"https://www.societe.com/societe/CONCEPT-HABITAT-825189525.html",
"https://www.societe.com/societe/D-R-H-484393632.html",
"https://www.societe.com/societe/DIJON-3D-823618632.html",
"https://www.societe.com/societe/DOYOUNO-SAS-831125620.html",
"https://www.societe.com/societe/DS-CAR-840753479.html",
"https://www.societe.com/societe/EFL-GERY-847766227.html",
"https://www.societe.com/societe/ETABLISSEMENTS-PATOUILLET-435820048.html",
"https://www.societe.com/societe/ETUDES-&-CONCEPTS-ECLAIRAGE-DISTRI-480509736.html",
"https://www.societe.com/societe/FCR-453373763.html",
"https://www.societe.com/societe/FINANCIERE-HELP-820221190.html",
"https://www.societe.com/societe/FL-POWER-804071173.html",
"https://www.societe.com/societe/GILPAYSAGES-828628735.html",
"https://www.societe.com/societe/GRZ-CONSULTING-850730169.html",
"https://www.societe.com/societe/IJK-847542032.html",
"https://www.societe.com/societe/JTG-SERVICES-848915062.html",
"https://www.societe.com/societe/KETOUFA-502211816.html",
"https://www.societe.com/societe/KIWI-CUBE-540011913.html",
"https://www.societe.com/societe/L'ATELIER-D'ANNE-MARIE-848605606.html",
"https://www.societe.com/societe/L'EMPREINTE-831166723.html",
"https://www.societe.com/societe/LES-DEUX-TILLEULS-881440358.html",
"https://www.societe.com/societe/MAISON-VERTE-58-829113844.html",
"https://www.societe.com/societe/MEGNAUD-FRERES-378867089.html",
"https://www.societe.com/societe/MENUISERIE-ET-AMENAGEMENT-CONCERTE-829772714.html",
"https://www.societe.com/societe/MG-CONSTRUCTION-BOURGOGNE-838647832.html",
"https://www.societe.com/societe/MH-CLEAN-VENISSIEUX-889028015.html",
"https://www.societe.com/societe/MINOTERIE-MEGNAUD-511914236.html",
"https://www.societe.com/societe/MURET-&-DESCHAMPS-843981788.html",
"https://www.societe.com/societe/MY-AUTOMEDON-811217868.html",
"https://www.societe.com/societe/NAULIN-FINANCES-435382288.html",
"https://www.societe.com/societe/NIVERNAISE-DES-FARINES---MOULIN-DES-C-840717375.html",
"https://www.societe.com/societe/ON.LY-848546875.html",
"https://www.societe.com/societe/PHILIPPE-GOURDON-410931208.html",
"https://www.societe.com/societe/SARL-LE-BOREAL-828332429.html",
"https://www.societe.com/societe/SARL-SILA-802237776.html",
"https://www.societe.com/societe/SARL-TRANSPORTS-GRONDIN-848874970.html",
"https://www.societe.com/societe/SERENITY-831000476.html",
"https://www.societe.com/societe/STEPHANE-LEVOIN-BRUNET-810194126.html",
"https://www.societe.com/societe/TC-RYORI-842166274.html",
"https://www.societe.com/societe/TD-PEINTURE-INDUSTRIELLE-ELECTROSTATIQUE-811520618.html",
"https://www.societe.com/societe/TOPH-EXPRESS-532578283.html",
"https://www.societe.com/societe/VAGAIS-841675481.html",
"https://www.societe.com/societe/VAN-DE-WEGE-503170292.html",
"https://www.societe.com/societe/VAOVERT-833729783.html",
"https://www.societe.com/societe/VAPO-CHIC.COM-799271853.html",
"https://www.societe.com/societe/VELO-LIBERTY-853476372.html",
"https://www.societe.com/societe/VERT-PUISAYE-823627476.html",
"https://www.societe.com/societe/VICTOR-GAYET-821616687.html",
"https://www.societe.com/societe/VILUX-839270436.html",
"https://www.societe.com/societe/ACTI-LIVRES-483597902.html",
"https://www.societe.com/societe/ALCHIMIE-BOURGOGNE-512534678.html",
"https://www.societe.com/societe/ALLIANCE-DES-FORESTIERS-833419666.html",
"https://www.societe.com/societe/ATELIER-THAIS-810300210.html",
"https://www.societe.com/societe/AU-BON-POINT-894196112.html",
"https://www.societe.com/societe/BFE-828434027.html",
"https://www.societe.com/societe/BROYER-MACONNERIE-850641473.html",
"https://www.societe.com/societe/CASA-D'ISABEL-881645170.html",
"https://www.societe.com/societe/COMMUNICATION-ACTIVE-SARL-348142225.html",
"https://www.societe.com/societe/D.T.C.-INTERNATIONAL-831455894.html",
"https://www.societe.com/societe/DIMOBAT-505003418.html",
"https://www.societe.com/societe/DISTRIPIZ89-899740542.html",
"https://www.societe.com/societe/DOMAINE-DES-CHAUCHOUX-344692041.html",
"https://www.societe.com/societe/ENTREPRISE-RENARD---B-838063279.html",
"https://www.societe.com/societe/ETABLISSEMENTS-SARIC-ET-P.-FILS-820008282.html",
"https://www.societe.com/societe/EURL-BOI-815158308.html",
"https://www.societe.com/societe/FC-CONCEPT-SASU-827788621.html",
"https://www.societe.com/societe/FOYER-DES-JEUNES-TRAVAILLEURS-DU-CREUSOT-397677873.html",
"https://www.societe.com/societe/G.S.D-PARAY-887980415.html",
"https://www.societe.com/societe/GARCIA-489314963.html",
"https://www.societe.com/societe/GUILLOT---ECHALLIER-TOPOGRAPHIE-493219679.html",
"https://www.societe.com/societe/HAPPY-HOUSE-881441778.html",
"https://www.societe.com/societe/HURRICANE-US-GARAGE-495381790.html",
"https://www.societe.com/societe/K-TIA-901189001.html",
"https://www.societe.com/societe/KAYABEY-BOIS-809663396.html",
"https://www.societe.com/societe/L'ECOLE-BUISSONNIERE-848710323.html",
"https://www.societe.com/societe/LA-GAZELLE-900306580.html",
"https://www.societe.com/societe/LA-GRANGE-CHALON-897455937.html",
"https://www.societe.com/societe/LES-DELICES-DE-FLORALYSA-797644440.html",
"https://www.societe.com/societe/LIBERTISSIMMO-532637634.html",
"https://www.societe.com/societe/LOST-WORLD-839395449.html",
"https://www.societe.com/societe/MARIEJP-879514370.html",
"https://www.societe.com/societe/MAX-D-832825434.html",
"https://www.societe.com/societe/NEGOCIANT-DE-LA-BRUYERE-751176504.html",
"https://www.societe.com/societe/NIDAL-903145787.html",
"https://www.societe.com/societe/PALMEIRA-509294930.html",
"https://www.societe.com/societe/PRALOC-&-ASSOCIES-852866367.html",
"https://www.societe.com/societe/RACH'L-834721524.html",
"https://www.societe.com/societe/RENOTECH-HABITAT-818251472.html",
"https://www.societe.com/societe/SAJE-COIFFURE-830531950.html",
"https://www.societe.com/societe/SARL-BARREAU-891358269.html",
"https://www.societe.com/societe/SARL-EURO-ALU-EQUIPEMENT-430418343.html",
"https://www.societe.com/societe/SARL-HENRY-BATIMENT-825171077.html",
"https://www.societe.com/societe/SARL-PARC-DE-LOISIRS-DU-CHABLISIEN-491815718.html",
"https://www.societe.com/societe/SBM-812258747.html",
"https://www.societe.com/societe/SP2B-833510100.html",
"https://www.societe.com/societe/STEELYRAV-812614204.html",
"https://www.societe.com/societe/SVPAC-SYNERGIES-VALO-PROMO-AUTO-COLLEC-452869472.html",
"https://www.societe.com/societe/TABARD-843270364.html",
"https://www.societe.com/societe/TRADI-FACAD-448296277.html",
"https://www.societe.com/societe/TVLL-844169078.html",
"https://www.societe.com/societe/URBAN-BENTO-897503306.html",
"https://www.societe.com/societe/ABW-831379698.html",
"https://www.societe.com/societe/AGIMENUISERIE-821342649.html",
"https://www.societe.com/societe/ARBEO-438067381.html",
"https://www.societe.com/societe/ART-METAL-842627242.html",
"https://www.societe.com/societe/ASSISTANCE-ET-SERVICES-DE-CONCEPTION-839926110.html",
"https://www.societe.com/societe/AUTO-SOINS-881948699.html",
"https://www.societe.com/societe/BATIMENT-ET-RENOVATION-ROLLET-PATRICK-452342629.html",
"https://www.societe.com/societe/BDC-889418240.html",
"https://www.societe.com/societe/BELFRANCE-752451732.html",
"https://www.societe.com/societe/BG-FORESTIER-71-851037424.html",
"https://www.societe.com/societe/BK-FIBRES-842228504.html",
"https://www.societe.com/societe/BVM-PROMOTION-491816179.html",
"https://www.societe.com/societe/CHEZ-FRANCOISE-ET-SEBASTIEN-477987978.html",
"https://www.societe.com/societe/CHOCOLAT-494725302.html",
"https://www.societe.com/societe/CMB-DIETETIQUE-831389432.html",
"https://www.societe.com/societe/D.S.I.P-824074314.html",
"https://www.societe.com/societe/DIJON-PIECES-MENAGER-451667398.html",
"https://www.societe.com/societe/ENTREPRISE-AUGUSTE-ROSSI-309046787.html",
"https://www.societe.com/societe/EQUIPEER-828989962.html",
"https://www.societe.com/societe/FORESTLAG-811882703.html",
"https://www.societe.com/societe/HOME-DESIGN-850100348.html",
"https://www.societe.com/societe/J.L.H.-RESTAURATION-819162108.html",
"https://www.societe.com/societe/JOSEPHINE-DECO-841433378.html",
"https://www.societe.com/societe/LA-VIE-BUISSONNIERE-SCIC-SARL-888221363.html",
"https://www.societe.com/societe/LE-LYLOU-882574940.html",
"https://www.societe.com/societe/MEDIAGROUP-499701589.html",
"https://www.societe.com/societe/ORIGAMI-PACK-ET-PLV-523048593.html",
"https://www.societe.com/societe/ROY-SERVICES-832204275.html",
"https://www.societe.com/societe/S.M.C.-SASU-809190333.html",
"https://www.societe.com/societe/SARL-PAUL-PERCHE-401125646.html",
"https://www.societe.com/societe/SAS.LE-RESTO-889618534.html",
"https://www.societe.com/societe/SOCIETE-DE-MENUISERIE-BOIS-PELLETIER-401943063.html",
"https://www.societe.com/societe/SOCIETE-T-ET-B-FONCIERE-451558894.html",
"https://www.societe.com/societe/T.R.E.E.-ENERGIE-820555738.html",
"https://www.societe.com/societe/TETRANE-530448232.html",
"https://www.societe.com/societe/TRAVAUX-RESEAUX-ENVIRONNEMENT-ELAGAGE-522414754.html",
"https://www.societe.com/societe/WILLYTIC-813995966.html",
"https://www.societe.com/societe/A.B.C-834429169.html",
"https://www.societe.com/societe/ABS-STAR-822854428.html",
"https://www.societe.com/societe/AE.BTP-800427320.html",
"https://www.societe.com/societe/AGM-PEINTURE-502841687.html",
"https://www.societe.com/societe/ALLIANCE-CONCEPT-893068098.html",
"https://www.societe.com/societe/ARI-880929070.html",
"https://www.societe.com/societe/B.C.I.F.-441504313.html",
"https://www.societe.com/societe/B.C.I.F.-ADMINISTRATIONS---COLLECTIVITES-442335212.html",
"https://www.societe.com/societe/BORN-TO-QUILT-528625353.html",
"https://www.societe.com/societe/BOULANGERIE-PATISSERIE-ADAM-825086820.html",
"https://www.societe.com/societe/BTD-SYSTEMS-498099290.html",
"https://www.societe.com/societe/CONCEPT-BOIS-DESIGN-852730969.html",
"https://www.societe.com/societe/EPICERIE-DE-GILIUS-839650025.html",
"https://www.societe.com/societe/EXPLOITATION-FORESTIERE-DE-LA-BRUYERE-490953817.html",
"https://www.societe.com/societe/GLT-LINE-893645622.html",
"https://www.societe.com/societe/GSA-DISTRIBUTION-813445624.html",
"https://www.societe.com/societe/HARAS-DE-LA-MERLUCHERIE-534368139.html",
"https://www.societe.com/societe/HORIZON-ZEN-851104661.html",
"https://www.societe.com/societe/IMEJI-443472428.html",
"https://www.societe.com/societe/IMMO-2A-381957638.html",
"https://www.societe.com/societe/ISIMY-851837484.html",
"https://www.societe.com/societe/L2M-RENOVATION-808619381.html",
"https://www.societe.com/societe/LA-PENICHE-D'AUXERRE-819915034.html",
"https://www.societe.com/societe/LE-BOEUF-QUI-CHANTE-830078366.html",
"https://www.societe.com/societe/LEC-PLOMBERIE-CHAUFFAGE-831215389.html",
"https://www.societe.com/societe/LES-TOILES-HEBERGEMENT-EPHEMERE-881596407.html",
"https://www.societe.com/societe/MAZENQ-JEAN-YVES-852619592.html",
"https://www.societe.com/societe/METAL-CONSTRUCTION-842656670.html",
"https://www.societe.com/societe/MISS-WAFFLE-830167607.html",
"https://www.societe.com/societe/MM-PRO-BATIMENT-853618957.html",
"https://www.societe.com/societe/MULTI-VARIABLE-FORMATION-884983123.html",
"https://www.societe.com/societe/NOES-848078713.html",
"https://www.societe.com/societe/NPM-MARKET-834711657.html",
"https://www.societe.com/societe/PHARMACIE-FOCH-794467407.html",
"https://www.societe.com/societe/PYXAL-448845941.html",
"https://www.societe.com/societe/RECYCLAGE-PIECES-POTS-METAUX-504880519.html",
"https://www.societe.com/societe/RO2-AVENTURE-528654924.html",
"https://www.societe.com/societe/S.T.A-821027612.html",
"https://www.societe.com/societe/SARL-BODY-LIFT-881244479.html",
"https://www.societe.com/societe/SARL-LES-BOULANGERIES-DE-GERMIGNY-414049619.html",
"https://www.societe.com/societe/SARL-PLANCON-FRERES-423906932.html",
"https://www.societe.com/societe/SASU-SEDRAT-ETANCHEITE-819590357.html",
"https://www.societe.com/societe/SIRAC-DIJON-812194926.html",
"https://www.societe.com/societe/SOCIETE-NOUVELLE-GENNETIER-TECHNIQUES-419974084.html",
"https://www.societe.com/societe/STYLE-ET-CONCEPT-888337946.html",
"https://www.societe.com/societe/T.E.V.L-824467682.html",
"https://www.societe.com/societe/TS-AGRICOLES-823752886.html",
"https://www.societe.com/societe/UTILIMOTORS-811238039.html",
"https://www.societe.com/societe/VIDE'HOME-823806849.html",
"https://www.societe.com/societe/Y.B.K-789860780.html",
"https://www.societe.com/societe/ACTA-PUBLICA-844517102.html",
"https://www.societe.com/societe/AG-INDUSTRIES-SAS-819108143.html",
"https://www.societe.com/societe/ALINE-THAI-SPA-850667841.html",
"https://www.societe.com/societe/AM-MENUISERIE-845346279.html",
"https://www.societe.com/societe/ASM-FERMETURES-820375814.html",
"https://www.societe.com/societe/AV-OCCAS-21-839243466.html",
"https://www.societe.com/societe/BEKER-821260791.html",
"https://www.societe.com/societe/BHC-505056879.html",
"https://www.societe.com/societe/C.M.L.-LEVAGE-500697750.html",
"https://www.societe.com/societe/CGE-AUTOMOBILES-799044102.html",
"https://www.societe.com/societe/CHAMPS-PLAISANTS-830955407.html",
"https://www.societe.com/societe/DELAHAYE-423628676.html",
"https://www.societe.com/societe/DELOMAS-482852738.html",
"https://www.societe.com/societe/EN-ROUE-LIBRE-829209774.html",
"https://www.societe.com/societe/ENTREMONT-HOTELLERIE-434881827.html",
"https://www.societe.com/societe/EURL-LA-BOUTIQUE-793792904.html",
"https://www.societe.com/societe/EURL-MARECHAL-389271446.html",
"https://www.societe.com/societe/FIABILEX-814973889.html",
"https://www.societe.com/societe/FRANCE-PRO-RHF-829126523.html",
"https://www.societe.com/societe/ICAU-FRANCE-343081741.html",
"https://www.societe.com/societe/LA-BASE-879394799.html",
"https://www.societe.com/societe/LA-BELLE-MAISON-802577007.html",
]
I think the problem comes from cookies but I couldn't find anything on this subject and I'm a little stuck. Do you have a solution to successfully launch a loop with a large number of urls without being blocked?
thanks in advance
Add a "cookie" entry to the headers dictionary, like this:
# Request headers: a browser-like User-agent plus a cookie value copied from
# a real browser session on the site.
user_agent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
session_cookie = "AHWqTUlhftkLIuXSbIVa5uKh77iLa_kw1Tx9rkm3xTMos06ERQq3MgXWSdg7-iCp9WA"
headers = {
    "User-agent": user_agent,
    "cookie": session_cookie,
}
You can get a cookie from your browser's DevTools. Go to the website, open DevTools, and switch to the Application tab. Expand the Cookies section and then copy the cookie value.
I would also suggest adding a sleep interval between requests. To use it, add import time at the top of the script and call the following inside the loop:
time.sleep(2)
Related
Can't get all results in tripadvisor using python and beautifulsoup due to pagination
I am trying to get links of restaurants but i can only get the first 30 and not all the others. Restaurants in Madrid Area are hundreads, the pagination only shows 30 in each page and the following code only get those 30 import re import requests from openpyxl import Workbook from bs4 import BeautifulSoup as b city_name = 'Madrid' geo_code = '187514' headers = { "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" } data = requests.get( "https://www.tripadvisor.com//Restaurants-g{}-{}.html".format(geo_code, city_name), headers=headers ).text for link in re.findall(r'"detailPageUrl":"(.*?)"', data): print("https://www.tripadvisor.com.sg/" + link) next_link = "https://www.tripadvisor.com.sg/" + link f.write('%s\n' % next_link)
Found the solution, had to add ao with number of the result in the url like: "https://www.tripadvisor.com//Restaurants-g{}-{}-{}.html".format(geo_code, city_name, n_review), headers=headers
Scraping PornHub Video redirects to Cornhub
I am making my first steps in webscraping and wanted to get Video Data from Pornhub. In a first step i went trough all the pages on the main page and collected the video links. This worked and i got a csv with around 100k links. If i copy/paste those links to the brower , those work fine. BUT, when i go over them with my script to get my desired values, it always redirects me to a Cornhub Video (i know this was an april fools day joke some time ago). So it seems that my request gets redirected, but i dont know how this happens and if i can do anything about it. '''with open("links.csv", "r") as f: lines = csv.reader(f) for adress in lines: data = [] print(data) headers = ({'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}) sleep(randint(2,5)) html = requests.get(adress[0], headers=headers) soup = BeautifulSoup(html.text, features="html.parser") print(soup) views = soup.find("script", type="application/ld+json") json_data = json.loads(views.contents[0]) interaction_stat = json_data["interactionStatistic"] views = int(interaction_stat[0] ["userInteractionCount"].replace(",", "")) duration = int( soup.find("meta", property="video:duration").get("content")) upload_date = datetime.datetime.strptime( json_data["uploadDate"][0:10], '%Y-%m-%d').date() video_id = soup.find("form", id="shareToStream") video_id = video_id.find("input", id="attachment").get("value") data.append(video_id) data.append(upload_date) data.append(views) data.append(duration) with open("data.csv", "a", newline="") as f: # Das hier über die schleife um es nur einmal zu machen writer = csv.writer(f) writer.writerow(data) '''
Your headers are really old, but this works just fine. Maybe make sure you alternate your IP or take some time before the subsequent requests. import requests from bs4 import BeautifulSoup import json lines = [ "XX62f79e2ed1ed8", "XX63078405e84b6", ] headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", "Host": "www.pornhub.com", "Refer": "https://www.pornhub.com/", } with requests.Session() as s: for line in lines: soup = ( BeautifulSoup(s.get(line, headers=headers).text, features="html.parser") .find("script", type="application/ld+json") ) json_data = ( json .loads(soup.getText()) ['interactionStatistic'][0]['userInteractionCount'] ) print(json_data) For the videos I've used the output is: 3,339,324 384,482
Data are overwrite in pandas
When I make the csv file data are overwrite in csv file If there is any solution provide me the link of the page is https://www.aeafa.es/asociados.php?provinput=&_pagi_pg=1 have already searched for an answer here and spent a long time on google, but nothing... I've already tried opening the file with 'w' instead of 'r' or 'a' but I still can't get my code to import requests from bs4 import BeautifulSoup import pandas as pd headers ={ 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36' } for page in range(1,3): r =requests.get('https://www.aeafa.es/asociados.php?provinput=&_pagi_pg={page}'.format(page=page), headers=headers) soup=BeautifulSoup(r.content, 'lxml') tag=soup.find_all('div',class_='col-md-8 col-sm-8') temp=[] for pro in tag: data=[tup.text for tup in pro.find_all('p')] Dirección=data[2] Dirección=Dirección[12:] Población=data[3] Población=Población[14:] Provincia=data[4] Provincia=Provincia[14:] Teléfono=data[5] Teléfono="+" + Teléfono[11:].replace('.', "") Email=data[6] Email=Email[10:] temp.append([Dirección,Provincia,Población,Teléfono, Email]) df=pd.DataFrame(temp,columns=["Dirección","Provincia","Población","Teléfono","Email"]) df.to_csv('samp.csv')
Try to put the list temp outside of the for-loop. Then, create the dataframe after all the loops finish: import requests import pandas as pd from bs4 import BeautifulSoup headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36" } temp = [] for page in range(1, 3): r = requests.get( "https://www.aeafa.es/asociados.php?provinput=&_pagi_pg={page}".format( page=page ), headers=headers, ) soup = BeautifulSoup(r.content, "lxml") tag = soup.find_all("div", class_="col-md-8 col-sm-8") for pro in tag: data = [tup.text for tup in pro.find_all("p")] Dirección = data[2] Dirección = Dirección[12:] Población = data[3] Población = Población[14:] Provincia = data[4] Provincia = Provincia[14:] Teléfono = data[5] Teléfono = "+" + Teléfono[11:].replace(".", "") Email = data[6] Email = Email[10:] temp.append([Dirección, Provincia, Población, Teléfono, Email]) df = pd.DataFrame( temp, columns=["Dirección", "Provincia", "Población", "Teléfono", "Email"] ) df.to_csv("samp.csv") print(len(df)) Prints: 98 Screenshot from LibreOffice:
Scrape URL loop with BeautifulSoup
I want to scrape information from different pages of the same site, societe.com, and I have several questions. First of all, here is the code that I managed to write (I admit I am a bit of a novice). I only put 2 URLs and a few pieces of information to see if the loop worked; I can add more when everything works: urls = ["https://www.societe.com/societe/decathlon-france-500569405.html","https://www.societe.com/societe/go-sport-312193899.html"] for url in urls: response = requests.get(url, headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'}) soup = BeautifulSoup(response.text, "html.parser") numrcs = soup.find("td", class_="numdisplay") nomcommercial = soup.find("td", class_="break-word") print(nomcommercial.text) print(numrcs.text.strip()) numsiret = soup.select('div[id^=siret_number]') for div in numsiret: print(div.text.strip()) formejuri = soup.select('div[id^=catjur-histo-description]') for div in formejuri: print(div.text.strip()) infosend = { 'numrcs': numrcs, 'nomcommercial':nomcommercial, 'numsiret':numsiret, 'formejuri':formejuri } tableau.append(infosend) print(tableau) my_infos = ['Numéro RCS', 'Numéro Siret ','Forme Juridique'] my_columns = [ np.tile(np.array(my_infos), len(nomcommercial)) ] df = pd.DataFrame( tableau,index=nomcommercial, columns=my_columns) df When I run the loop, the right information comes out, for example: DECATHLON FRANCE Lille Metropole B 500569405 50056940503239 SASU Société par actions simplifiée à associé unique However, I would like to put all this information in a table, but I can't: only the last company appears and the data makes no sense. I tried to follow a tutorial, without success. If you can help me, I would be really happy.
To get data about the companies, you can use the following example: import requests import pandas as pd from bs4 import BeautifulSoup urls = [ "https://www.societe.com/societe/decathlon-france-500569405.html", "https://www.societe.com/societe/go-sport-312193899.html", ] headers = { "User-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36" } data = [] for url in urls: soup = BeautifulSoup( requests.get(url, headers=headers).content, "html.parser" ) title = soup.select_one("#identite_deno").get_text(strip=True) rcs = soup.select_one('td:-soup-contains("Numéro RCS") + td').get_text( strip=True ) siret_number = soup.select_one("#siret_number").get_text(strip=True) form = soup.select_one("#catjur-histo-description").get_text(strip=True) data.append([title, url, rcs, siret_number, form]) df = pd.DataFrame( data, columns=["Title", "URL", "Numéro RCS", "Numéro Siret", "Forme Juridique"], ) print(df.to_markdown()) Prints: Title URL Numéro RCS Numéro Siret Forme Juridique 0 DECATHLON FRANCE (DECATHLON DIRECTION GENERALE FRANCE) https://www.societe.com/societe/decathlon-france-500569405.html Lille Metropole B 500569405 50056940503239 SASU Société par actions simplifiée à associé unique 1 GO SPORT https://www.societe.com/societe/go-sport-312193899.html Grenoble B 312193899 31219389900191 Société par actions simplifiée
Printing Text Scraped Using BeautifulSoup to Pandas Dataframe without Tags
I have been working on the code below and getting myself tied up in knots. What I am trying to do is build a simple dataframe using text scraped using BeautifulSoup. I have scraped the applicable text from the <h5> and <p> tags but using find_all means that when I build the dataframe and write to csv the tags are included. To deal with this I have added the print(p.text, end=" ") statements but now nothing is being written to the csv. Can anyone see what I am doing wrong? import pandas as pd import requests from bs4 import BeautifulSoup headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36', 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', } course = [] runner = [] page = requests.get('https://www.attheraces.com/tips/atr-tipsters/hugh-taylor', headers=headers) soup = BeautifulSoup(page.content, 'html.parser') tips = soup.find('div', class_='sticky') for h5 in tips.find_all("h5"): course_name = print(h5.text, end=" ") course.append(course_name) for p in tips.find_all("p"): runner_name = print(p.text, end=" ") runner.append(runner_name) todays_tips = pd.DataFrame( {'Course': course, 'Selection': runner, }) print(todays_tips) todays_tips.to_csv(r'C:\Users\*****\Today.csv')
Don't assign the result of print() to a variable: print() always returns None, so your lists end up filled with None instead of the text. Append the text itself, and consider using a list comprehension. Applying this should get you the dataframe you want. For example: import pandas as pd import requests from bs4 import BeautifulSoup headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36', 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', } page = requests.get('https://www.attheraces.com/tips/atr-tipsters/hugh-taylor', headers=headers) tips = BeautifulSoup(page.content, 'html.parser').find('div', class_='sticky') course = [h5.getText() for h5 in tips.find_all("h5")] runner = [p.getText() for p in tips.find_all("p")] todays_tips = pd.DataFrame({'Course': course, 'Selection': runner}) print(todays_tips) todays_tips.to_csv("your_data.csv", index=False) Output: Course Selection 0 1.00 HAYDOCK 1pt win RAINBOW JET (12-1 & 11-1 general) 1 2.50 GOODWOOD 1pt win MARSABIT (11-2 general) And a .csv file: