I am new to programming. I have created a web scraper in Python using Beautiful Soup, but when I run the program it opens the Python command line, the cursor just blinks, and nothing happens. Now I receive these errors:
TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host
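From what I can tell, urlopen blocks with no timeout by default, which would explain the blinking cursor before these errors finally appear. A minimal probe that makes the failure show up right away (the timeout value and User-Agent header below are just illustrative, not something the site is known to require):

import urllib.request
import urllib.error

# Hypothetical probe: an explicit timeout makes urlopen fail fast instead of
# hanging, and a browser-style User-Agent makes the request look less like a
# script the server might silently drop.
url = "https://maharerait.mahaonline.gov.in/searchlist/searchlist"
req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
try:
    response = urllib.request.urlopen(req, timeout=30)  # seconds, illustrative
    print(response.status, len(response.read()))
except (urllib.error.URLError, TimeoutError, ConnectionResetError) as exc:
    print("request failed:", exc)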
Below is my code:
import urllib.request
import urllib.parse
import json
import csv
from bs4 import BeautifulSoup

link = 'https://maharerait.mahaonline.gov.in/searchlist/searchlist'
talukaLink = "https://maharerait.mahaonline.gov.in/SearchList/GetTaluka"
distlink = "https://maharerait.mahaonline.gov.in/SearchList/GetDistrict"
prjLink = "https://maharerait.mahaonline.gov.in/SearchList/GetProjectName"

alldata = []
links = {}
certificatedata = []

def getData(url, values):
    # POST the form values and return the response body as text
    data = urllib.parse.urlencode(values)
    data = data.encode('utf-8')
    req = urllib.request.Request(url, data)
    response = urllib.request.urlopen(req)
    data = response.read()
    data = data.decode("utf-8")
    return data

def getDivsion():
    ## for now we are taking 6 districts.. it needs to be updated when the data gets updated
    return range(1, 7)

def getDistrict(divId):
    global distlink
    values = {'DivID': divId}
    data = getData(distlink, values)
    return data

def parseJson(data):
    parsed = json.loads(data)
    return parsed

def getTaluka(disId):
    global talukaLink
    values = {'DisID': disId}
    data = getData(talukaLink, values)
    return data

def getProjects(divId, disId):
    global prjLink
    values = {'DisID': disId, 'DivID': divId}
    #print(values)
    data = getData(prjLink, values)
    if len(data) < 10:
        return "{}"
    return data

def getProjectsList():
    divList = getDivsion()
    flag = 0
    for divId in divList:
        disData = getDistrict(divId)
        disList = parseJson(disData)
        for disObj in disList:
            disId = disObj["ID"]
            prjData = getProjects(divId, disId)
            #print(" >>>> "+str(disId)+" >> "+str(divId))
            #print(prjData)
            prjJson = parseJson(prjData)
            for prjObj in prjJson:
                flag += 1
                prjId = prjObj["ID"]
                values = {'ID': 0, 'pageTraverse': 1, 'Division': divId,
                          'hdnDistrict': '', 'hdnProject': '', 'District': disId,
                          'Taluka': '', 'Village': '', 'Project': prjId,
                          'CertiNo': '', 'btnSearch': 'Search'}
                finalPrjData = getData(link, values)
                parseXMLData(finalPrjData)
                #if len(alldata)>100:
                #    break

def parseXMLData(htmldata):
    global alldata, links
    soup = BeautifulSoup(htmldata, "html.parser")
    tables = soup.find_all("table")
    for table in tables:
        print(len(alldata))
        attr = table.attrs
        # .get avoids a KeyError on tables without a class attribute
        if "table" in attr.get('class', []):
            tbody = table.find_all("tbody")
            if len(tbody) > 0:
                tbody = tbody[0]
                tr_lst = tbody.find_all("tr")
                for tr in tr_lst:
                    sublist = []
                    td_lst = tr.find_all("td")
                    if len(td_lst) > 6:
                        prjname = td_lst[1].text
                        proname = td_lst[2].text
                        certNo = td_lst[3].text
                        sublist.append(prjname)
                        sublist.append(proname)
                        sublist.append(certNo)
                        td = td_lst[4]
                        a_lst = td.find_all("a")
                        if len(a_lst) > 0:
                            a = a_lst[0]
                            href = a.attrs['href']
                            # renamed from "link" so it does not shadow the
                            # global search URL of the same name
                            certLink = "https://maharerait.mahaonline.gov.in/" + href
                            links[certNo] = certLink
                            sublist.append(certLink)
                    if len(sublist) > 0:
                        alldata.append(sublist)
    return alldata

def writedata(alldata1, filename):
    print(" >>>> FINAL PRINTING DATA >>>> ")
    #import pdb; pdb.set_trace()
    # newline='' stops csv from writing a blank line after every row on Windows
    with open("./" + filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        #writer.writerow(titleRow)
        writer.writerow("")
        for i in range(0, len(alldata1)):
            #print(alldata1[i])
            writer.writerow(alldata1[i])

def processlinksforcert():
    global links, certificatedata
    print(">> Came in fetching certificates data >>> ")
    for certno in links.keys():
        certLink = links[certno]
        htmldata = getData(certLink, {})
        soup = BeautifulSoup(htmldata, "html.parser")
        divs = soup.find_all("div")
        for div in divs:
            attr = div.attrs
            if "id" in attr.keys() and "DivProfessional" in attr['id']:
                table = div.find_all("table")
                if len(table) <= 0:
                    continue
                t_attr = table[0].attrs
                if "table" in t_attr.get("class", []):
                    print(len(certificatedata))
                    table = table[0]
                    tr_lst = table.find_all("tr")
                    # start at 1 to skip the header row
                    index = 1
                    while index < len(tr_lst):
                        #import pdb; pdb.set_trace()
                        tr = tr_lst[index]
                        index += 1
                        sublist = []
                        td_lst = tr.find_all("td")
                        if len(td_lst) > 2:
                            sublist.append(certno)
                            pername = formattext(td_lst[0].text)
                            cerno = formattext(td_lst[1].text)
                            proftype = formattext(td_lst[2].text)
                            sublist.append(pername)
                            sublist.append(cerno)
                            sublist.append(proftype)
                            certificatedata.append(sublist)
    return certificatedata

def formattext(text):
    # str.replace already removes every occurrence, so no loop is needed
    text = text.replace("\r\n", "")
    text = text.replace(" ", "")
    return text

def main():
    global alldata, certificatedata
    #data = getData(url, {})
    getProjectsList()
    print("Before write the projects data to the file. Count >> " + str(len(alldata)))
    writedata(alldata, "data.csv")
    data = processlinksforcert()
    print("Before write the certificates data to the file. Count >> " + str(len(data)))
    writedata(data, "certificates.csv")

main()
Can someone please suggest what I am doing wrong? I have pip installed, and Beautiful Soup is installed via pip as well.
Regarding https://maharerait.mahaonline.gov.in/SearchList/GetTaluka: I think you probably have to pass cookies or login info to the destination site so your code is allowed to get the data. Right now it can't access the source, which is why it fails once the timeout expires. - Grynets
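A minimal sketch of that suggestion, assuming a session cookie set by the search page is all the site expects (the User-Agent value and the DivID payload here are illustrative, not confirmed requirements of the site):

import urllib.request
import urllib.parse
import http.cookiejar

# Keep cookies across requests the way a browser would: the site may set a
# session cookie on the first GET that it expects back on later POSTs.
jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(jar))
opener.addheaders = [("User-Agent", "Mozilla/5.0")]  # browser-like header, illustrative

# First GET the search page so any session cookie lands in the jar...
opener.open("https://maharerait.mahaonline.gov.in/searchlist/searchlist", timeout=30)

# ...then POST the same kind of form data the scraper uses, cookie attached.
data = urllib.parse.urlencode({"DivID": 1}).encode("utf-8")
response = opener.open("https://maharerait.mahaonline.gov.in/SearchList/GetDistrict",
                       data, timeout=30)
print(response.read().decode("utf-8"))

If that still times out, the block is likely at a different layer (TLS fingerprinting, IP filtering, or a required login), and cookies alone won't fix it.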