I am trying to parse the table on this site using Python's Beautiful Soup. While the code produces correct output on my Ubuntu 14.04 machine, it produces wrong output on my friend's Windows machine. I am pasting the code snippet here:
from bs4 import BeautifulSoup
def buildURL(agi, families):
    """Build the athamap.de gene-search URL for the given genes and families.

    Args:
        agi: Whitespace-separated string of gene identifiers.
        families: Whitespace-separated string of family names.

    Returns:
        The search URL as a string. Genes are joined with an encoded CR LF
        ("%0D%0A") in the ``agi`` parameter, and each family adds one
        ``familySelected[<family>]=on`` parameter with "/" percent-encoded.
    """
    # split() with no argument discards empty tokens, so repeated, leading or
    # trailing spaces no longer yield blank genes or an empty
    # familySelected[] key.
    genes = agi.split()
    family_names = families.split()

    parts = ["http://www.athamap.de/search_gene.php"]
    # The agi parameter is always emitted (even when empty) so that the
    # "&..." parameters appended below follow a "?".
    parts.append("?agi=" + "%0D%0A".join(genes))
    parts.append(
        "&upstream=-500&downstream=50&restriction=0"
        "&sortBy1=gen&sortBy2=fac&sortBy3=pos"
    )
    for family in family_names:
        encoded = family.replace("/", "%2F")
        parts.append("&familySelected%5B" + encoded + "%5D=on")
    parts.append("&formSubmitted=TRUE")
    return "".join(parts)
def fetch_html(agi, families):
    """Download the search result page and return its result div.

    Args:
        agi: Whitespace-separated string of gene identifiers.
        families: Whitespace-separated string of family names.

    Returns:
        The ``<div id="geneAnalysisDetail">`` element of the fetched page,
        or the empty string "" when the page contains no such div.
    """
    url = buildURL(agi, families)
    # NOTE(review): relies on `requests` being imported at module level;
    # confirm the import exists in the full file.
    response = requests.get(url)
    # response.text is already decoded text, so it is handed to the parser
    # unchanged. Wrapping it in str() re-encodes it on Python 2 and fails on
    # non-ASCII pages.
    soup = BeautifulSoup(response.text, "lxml")
    # Filter on the id attribute directly instead of probing div["id"] on
    # every div behind a bare except.
    matches = soup.find_all('div', id="geneAnalysisDetail")
    # Keep the last match, as the original full scan did; "" means not found.
    return matches[-1] if matches else ""
def parse(seldiv):
    """Extract the table rows of the result div as lists of strings.

    Args:
        seldiv: The element returned by fetch_html, or "" when the result
            div was not found.

    Returns:
        A list with one entry per table row after the first two rows
        (presumably header rows -- confirm against the page). Each entry is
        a list holding one string per ``<td>`` cell. Returns an empty list
        when ``seldiv`` is empty.
    """
    attributes = ["Gene", "Factor", "Family", "Position", "Relative orientation", "Relative Distance", "Max score", "Threshold Score", "Score"]
    # fetch_html signals "div not found" with "", which has no find_all();
    # report no rows instead of raising AttributeError.
    if not seldiv:
        return []
    print(attributes)

    # In these columns the value is the first child of the cell's second
    # child; in every other column it is the cell's own first child.
    nested_columns = (0, 1, 3)

    save_rows = []
    for row in seldiv.find_all('tr')[2:]:
        values = []
        for index, col in enumerate(row.find_all('td')):
            holder = col.contents[1] if index in nested_columns else col
            # The former re.sub('', '', ...) wrapper on column 0 used an
            # empty pattern and replacement, so it changed nothing.
            values.append(str(holder.contents[0]))
        save_rows.append(values)
    return save_rows
Any idea what could go wrong here? I have tried with and without lxml.
Thanks in advance.
tags in some cases on Windows?? - MD. Khairul Basar