I would scrape data from two html tables in Transfermarkt player profile page. Here is an example of page: https://www.transfermarkt.com/cristiano-ronaldo/profil/spieler/8198
First one is "Fact and data" table and second one is "stats" table. I want to start scraping from search pages and gets urls. Once I got urls from every single page of search pages start scraping stats for each player link.
How can I scrape data of html tables from that links?
Here my fully code
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
url_page="https://www.transfermarkt.com/detailsuche/spielerdetail/suche/27403221"
response = requests.get(url=url_page,
headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
response.elapsed.seconds
soup = BeautifulSoup(response.content, "html.parser")
for link in soup.find_all('table',class_='items'):
for link_pag in link.find_all(class_='spielprofil_tooltip'):
#add page loop
url_page="https://www.transfermarkt.com"+link_pag.attrs["href"]
response_pagina = requests.get(url=url_page,
headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
soup_pagina = BeautifulSoup(response_pagina.content, "html.parser")
time.sleep(3)
for n_player in soup_pagina('h1', itemprop="name"):
name = n_player.text
for value_player in soup_pagina('span', class_="waehrung"):
price = value_player.text
data_table = soup_pagina.find('table', class_='auflistung')
for data in data_table.find_all('tbody'):
rows = data.find_all('tr')
for row in rows:
try:
date_of_birth = row.find('td', [1]).text
except:
date_of_birth = ""
place_of_birth = row.find('td', [2]).text
age = row.find('td', [3]).text
height = row.find('td', [4]).text
citizenship = row.find('td', [5]).text
position = row.find('td', [6]).text
foot = row.find('td', [7]).text
agent = row.find('td', [8]).text
club = row.find('td', [9]).text
joined = row.find('td', [10]).text
contract_expired = row.find('td', [11]).text
contract_extension = row.find('td', [12]).text
stats_table = soup_pagina.find('table', class_='items')
for stats in stats_table.find_all('tfoot'):
rows_s = stats.find_all('td'):
for row_s in rows_s:
total = row.find('td', [3]).text
goal = row.find('td', [4]).text
assist = row.find('td', [5]).text
goal_per_min = row.find('td', [6]).text
total_min = row.find('td', [7]).text
data_stats = {
'name': name,
'price': price,
'data_of_birth': data_of_birth,
'place_of_birth': place_of_birth,
'age': age,
'height': height,
'citizenship': citizenship,
'position': position,
'foot': foot,
'agent': agent,
'club': club,
'joined': joined,
'contract_expired': contract_expired,
'contract_extension': contract_extension,
}
players_stats.append(data_stats)
players_stats = []
df = pd.DataFrame(players_stats)
print(df.head())
df.to_csv('players.csv', index=False)
