I am trying to parse the table on this site using Python and Beautiful Soup. While it produces correct output on my Ubuntu 14.04 machine, it produces wrong output on my friend's Windows machine. I am pasting the code snippet here:
def buildURL(agi, families):
    """Build the athamap.de search-gene URL.

    agi and families are space-separated strings of gene IDs and
    transcription-factor family names. Returns the fully-encoded URL.
    """
    genes = agi.split(" ")
    family_list = families.split(" ")
    url = "http://www.athamap.de/search_gene.php"
    # NOTE: "".split(" ") yields [""], so this branch always runs; kept
    # as-is to reproduce the original URLs exactly.
    if len(genes):
        # Genes are joined with an encoded CRLF (%0D%0A), matching the
        # site's textarea submission format.
        url = url + "?agi=" + "%0D%0A".join(genes)
    url = url + "&upstream=-500&downstream=50&restriction=0&sortBy1=gen&sortBy2=fac&sortBy3=pos"
    for family in family_list:
        # "/" must be percent-encoded inside the bracketed parameter name.
        family = family.replace("/", "%2F")
        url = url + "&familySelected%5B" + family + "%5D=on"
    url = url + "&formSubmitted=TRUE"
    return url


def fetch_html(agi, families):
    """Fetch the result page and return the <div id="geneAnalysisDetail">
    element holding the results table, or "" if it is absent.
    """
    # Local imports: the original snippet never imported requests at all,
    # and keeping them here lets the module import without bs4/requests.
    import requests
    from bs4 import BeautifulSoup

    url = buildURL(agi, families)
    response = requests.get(url)
    # Parse the raw bytes and pin an explicit, bundled parser.
    # The original passed response.text (guessed encoding) to "lxml";
    # lxml availability/behaviour differs across machines, which is the
    # likely cause of the Ubuntu-vs-Windows output mismatch.
    soup = BeautifulSoup(response.content, "html.parser")
    seldiv = ""
    for div in soup.find_all('div'):
        # .get() returns None for divs without an id — no bare except needed.
        if div.get("id") == "geneAnalysisDetail":
            seldiv = div  # this div contains the interesting data
    return seldiv


def parse(seldiv):
    """Extract the result rows from the geneAnalysisDetail div.

    Returns a list of rows, each a list of column strings in the order
    given by `attributes` below.
    """
    rows = seldiv.find_all('tr')
    attributes = ["Gene", "Factor", "Family", "Position",
                  "Relative orientation", "Relative Distance",
                  "Max score", "Threshold Score", "Score"]
    print(attributes)  # parenthesized form works on both Python 2 and 3
    save_rows = []
    # The first two <tr> are header rows; data starts at index 2.
    for row in rows[2:]:
        lst = []
        for j, col in enumerate(row.find_all('td')):
            # Columns 0, 1 and 3 wrap their value in a child tag (a link),
            # so the text lives one level deeper. The original additionally
            # ran re.sub('', '', ...) on column 0 — a no-op (and `re` was
            # never imported) — so that call is dropped.
            if j in (0, 1, 3):
                lst.append(str(col.contents[1].contents[0]))
            else:
                lst.append(str(col.contents[0]))
        save_rows.append(lst)
    return save_rows
Any idea what could be going wrong here? I have tried both with and without lxml.
Thanks in advance.
Could the parser be handling the tags differently in some cases on Windows? – MD. Khairul Basar