0
votes

I wrote this code to scrape data from the mobile phone category on Flipkart. The problem I am facing is an AttributeError when an element is not there ("AttributeError: 'NoneType' object has no attribute 'text'"). How can I modify this code so that, if an element is missing, the field is populated as "Not Available"? See the code below. I'm a beginner in programming and any help would be appreciated.

'''

import requests

from bs4 import BeautifulSoup

import csv

import re

base_url = "https://www.flipkart.com/search?q=mobiles&page="

def get_urls():
    """Scrape name, price, rating and product URL for each phone listing
    on Flipkart's mobile search pages and append one CSV row per listing.

    A field whose element is absent from a listing (e.g. an unrated
    product) is written as "Not Available" instead of raising
    AttributeError on the ``.text`` of a ``None`` result.
    """
    # newline='' prevents the csv module from writing blank rows on Windows.
    with open("fliplart-data.csv", "a", newline="") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(
            ['Product_name', 'Price', 'Rating', 'Product-url'])

        for page_number in range(1, 510):
            page_url = base_url + str(page_number)
            response = requests.get(page_url).text
            soup = BeautifulSoup(response, 'lxml')

            # Each result card is an <a class="_1fQZEK"> wrapping the details.
            for card in soup.find_all('a', href=True, attrs={'class': '_1fQZEK'}):
                # find() returns None when the element is missing; guard
                # before touching .text so a missing field becomes
                # "Not Available" rather than an AttributeError.
                name_tag = card.find('div', attrs={'class': '_4rR01T'})
                name = name_tag.text if name_tag else "Not Available"

                price_tag = card.find('div', attrs={'class': '_30jeq3 _1_WHN1'})
                if price_tag:
                    # Drop the leading rupee sign; keep only the amount.
                    price = price_tag.text.split('₹')[-1]
                else:
                    price = "Not Available"

                rating_tag = card.find('div', attrs={'class': '_3LWZlK'})
                rating = rating_tag.text if rating_tag else "Not Available"

                # Use the href of THIS card — calling soup.find() here would
                # return the first card on the page every iteration, pairing
                # every row with the same wrong URL. Strip tracking params
                # after the first '&'.
                item_url = "https://www.flipkart.com" + card['href']
                item_url = item_url.split('&')[0]

                print(f'Product name is {name}')
                print(f'Product price is {price}')
                print(f'Product rating is {rating}')
                print(f'Product url is {item_url}')

                writer.writerow(
                    [name, price, rating, item_url])

get_urls()

'''

1

1 Answers

0
votes

You can surround each extraction in a try/except block: if an AttributeError is raised because the element is missing, the except block sets the value to "Not Available" instead of crashing.

import requests

from bs4 import BeautifulSoup

import csv

import re

base_url = "https://www.flipkart.com/search?q=mobiles&page="

def get_urls():
    """Scrape name, price, rating and product URL for each phone listing
    on Flipkart's mobile search pages and append one CSV row per listing.

    Each extraction is wrapped in try/except so a listing that lacks an
    element (e.g. no rating yet) records "Not Available" for that field
    instead of aborting the whole scrape with an AttributeError.
    """
    # The context manager guarantees the file is flushed and closed even
    # if the scrape raises part-way; the bare open() was never closed.
    # newline='' prevents blank rows on Windows.
    with open("fliplart-data.csv", "a", newline="") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(
            ['Product_name', 'Price', 'Rating', 'Product-url'])

        for page in range(1, 510):
            response = requests.get(base_url + str(page)).text
            soup = BeautifulSoup(response, 'lxml')

            # Each result card is an <a class="_1fQZEK"> wrapping the details.
            for card in soup.find_all('a', href=True, attrs={'class': '_1fQZEK'}):

                # name — find() yields None for a missing element, and
                # None.text raises AttributeError; catch only that, so
                # genuine bugs are not silently swallowed.
                try:
                    name = card.find('div', attrs={'class': '_4rR01T'}).text
                except AttributeError:
                    name = "Not Available"

                # price — drop the leading rupee sign, keep the amount.
                try:
                    price = card.find('div', attrs={'class': '_30jeq3 _1_WHN1'}).text
                    price = price.split('₹')[-1]
                except AttributeError:
                    price = "Not Available"

                # rating
                try:
                    rating = card.find('div', attrs={'class': '_3LWZlK'}).text
                except AttributeError:
                    rating = "Not Available"

                # item_url — read the href of THIS card. The original used
                # soup.find(...), which returns the first card on the page
                # every iteration, so every row carried the same (wrong)
                # URL — that is the name/URL mismatch seen in the output.
                # Tracking parameters after the first '&' are stripped.
                try:
                    item_url = "https://www.flipkart.com" + card['href']
                    item_url = item_url.split('&')[0]
                except (KeyError, TypeError):
                    item_url = "Not Available"

                print(f'Product name is {name}')
                print(f'Product price is {price}')
                print(f'Product rating is {rating}')
                print(f'Product url is {item_url}')

                writer.writerow(
                    [name, price, rating, item_url])

get_urls()

Output

Product name is intaek 5616
Product price is 789
Product rating is Not Available
Product url is  https://www.flipkart.com/kxd-m1/p/itm89bbc238d6356?pid=MOBFUXKG3DYVZRQV

Looking at the results from your scraping, the actual data doesn't match the URL it claims to come from. This might be part of the problem you are having as well.