Web parsing with python beautifulsoup producing inconsistent result

Question

I am trying to parse the table on this site using Python's Beautiful Soup. While it produces the correct output on my Ubuntu 14.04 machine, it produces the wrong output on my friend's Windows machine. Here is the code snippet:

from bs4 import BeautifulSoup

def buildURL(agi, families):
    """Assemble the search_gene.php query URL for the given genes and families.

    Both arguments are strings whose items are separated by single spaces.
    Genes are joined with "%0D%0A" (a URL-encoded CR LF) in the ``agi``
    parameter; each family becomes its own ``familySelected[...]=on``
    parameter, with "/" in the family name replaced by "%2F".
    """
    gene_ids = agi.split(" ")
    family_names = families.split(" ")

    pieces = ["http://www.athamap.de/search_gene.php"]

    if gene_ids:
        pieces.append("?agi=" + "%0D%0A".join(gene_ids))

    pieces.append("&upstream=-500&downstream=50&restriction=0&sortBy1=gen&sortBy2=fac&sortBy3=pos")

    for name in family_names:
        escaped = name.replace("/", "%2F")
        pieces.append("&familySelected%5B" + escaped + "%5D=on")

    pieces.append("&formSubmitted=TRUE")
    return "".join(pieces)

def fetch_html(agi, families):
    """Fetch the search result page and return its ``geneAnalysisDetail`` div.

    The URL is built by ``buildURL(agi, families)``, so both arguments are
    space-separated strings.

    Returns the last ``<div>`` whose ``id`` attribute equals
    ``"geneAnalysisDetail"``, or the empty string ``""`` when the page
    contains no such div.
    """
    url = buildURL(agi, families)
    response = requests.get(url)

    # response.text is already decoded text; wrapping it in str() is a no-op
    # on Python 3 and can raise UnicodeEncodeError on Python 2 for pages
    # containing non-ASCII characters.
    soup = BeautifulSoup(response.text, "lxml")

    seldiv = ""
    for div in soup.find_all('div'):
        # .get() yields None for divs without an id attribute, so no
        # catch-all exception handler is needed to skip them.
        if div.get("id") == "geneAnalysisDetail":
            # This div contains interesting data
            seldiv = div

    return seldiv

def parse(seldiv):
    """Extract the table rows found inside *seldiv* as lists of strings.

    *seldiv* is the element returned by ``fetch_html``; it must provide
    ``find_all``. The column names are printed once, then every ``<tr>``
    except the first two (presumably header rows -- confirm against the
    page) is converted to a list with one string per ``<td>``.

    For columns 0, 1 and 3 the value is read from the first child of the
    cell's second child; for all other columns it is the cell's first child.

    Returns a list of rows, each row being a list of strings.
    """
    rows = seldiv.find_all('tr')

    attributes = ["Gene", "Factor", "Family", "Position", "Relative orientation", "Relative Distance", "Max score", "Threshold Score", "Score"]

    # print() with a single argument behaves identically on Python 2 and 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print(attributes)

    # Columns whose value sits one level deeper in the cell's children.
    nested_columns = (0, 1, 3)

    save_rows = []
    for row in rows[2:]:
        lst = []
        for j, col in enumerate(row.find_all('td')):
            if j in nested_columns:
                value = str(col.contents[1].contents[0])
            else:
                value = str(col.contents[0])
            if j == 0:
                # NOTE(review): with an empty pattern and empty replacement
                # this substitution leaves the value unchanged; the intended
                # pattern may have been lost -- confirm before removing.
                value = re.sub('', '', value)
            lst.append(value)
        save_rows.append(lst)
    return save_rows

Any idea what could go wrong here? I have tried with and without lxml.

Thanks in advance.

What do you mean "it's producing the wrong output"? Do you get an error, do you get different values? — asongtoruin
What different values is it producing? Is it returning values with tags in some cases on Windows? — MD. Khairul Basar

MD. Khairul Basar MD. Khairul Basar · Accepted Answer · 2017-04-04T12:01:32

You can parse the table this way, and it should work well on both machines. The buildURL function should be left unchanged.

import requests
from bs4 import BeautifulSoup

def fetch_html(url, timeout=30):
    """Download *url* and return its ``geneAnalysisDetail`` div.

    Args:
        url: Address of the result page to fetch.
        timeout: Seconds handed to ``requests.get``; without a timeout the
            request can block indefinitely on an unresponsive server.

    Returns:
        The first ``<div>`` with ``id="geneAnalysisDetail"``, or ``None``
        when the page has no such element.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on HTTP errors instead of parsing an error page as if it were
    # a result page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "lxml")
    return soup.find("div", id="geneAnalysisDetail")

def parse(url):
    """Fetch *url* and return the text of its result-table rows.

    The page is retrieved through ``fetch_html``. Every ``<tr>`` except the
    first two (presumably header rows -- confirm against the page) is turned
    into a list holding the text of each ``<td>``, with surrounding
    whitespace removed.

    Returns:
        A list of rows, each a list of strings. An empty list is returned
        when ``fetch_html`` finds no result div.
    """
    soup = fetch_html(url)
    if soup is None:
        # No result div on the page: there are no rows to report.
        return []

    save_rows = []
    for row in soup.find_all("tr")[2:]:
        # strip() with no argument removes every kind of surrounding
        # whitespace in one pass; stripping " " and then "\n" separately
        # leaves residue for mixed padding such as "\n  value  \n" or "\r\n".
        save_rows.append([col.get_text().strip() for col in row.find_all("td")])
    return save_rows

# Example query: six gene identifiers separated by "%0D%0A" (URL-encoded
# CR LF) and three selected families (ARF, CAMTA, GARP/ARR-B).
url = "http://www.athamap.de/search_gene.php?agi=At1g76540%0D%0AAt3g12280%0D%0AAt4g28980%0D%0AAt4g37630%0D%0AAt5g11300%0D%0AAt5g27620%0D%0A&upstream=-500&downstream=50&restriction=0&sortBy1=gen&sortBy2=fac&sortBy3=pos&familySelected[ARF]=on&familySelected[CAMTA]=on&familySelected[GARP%2FARR-B]=on&formSubmitted=TRUE"
# Fetch and parse the page, then print one parsed table row per line.
save_rows = parse(url)
for row in save_rows:
    print(row)

Web parsing with python beautifulsoup producing inconsistent result

2 Answers