I am working on a web scraping project. The first dropdown menu has about 800 options and the second dropdown menu has over 20 values. The process of doing it is very slow. So I tried to use multiprocessing in the hope that it could accelerate the process a little bit. However, I got the error messages I couldn't solve.
My codes are:
def create_df(city_var, year_var):
city = Select(driver.find_element_by_css_selector("select[id*='Main_csCity_ddlEntity1']"))
city.select_by_visible_text(city_var)
year = Select(driver.find_element_by_css_selector("select[id*='Main_csCity_ddlYear1']"))
year.select_by_visible_text(year_var)
try:
driver.find_element_by_xpath('//input[@type="submit"]').click()
except Exception as e:
time.sleep(1)
driver.find_element_by_xpath('//input[@type="submit"]').click()
print('something wrong:'+city_var+year_var)
html = driver.page_source
soup = BeautifulSoup(html, "lxml")
try:
small_header = soup.find_all("div",{"class":"ResultsHeader"})
ret_list = []
for idx, span in enumerate(small_header[0].find_all("span")):
if idx in [1,3,5,7]:
ret_list.append(span.contents[0])
except:
print(city_var+year_var)
try:
second_header = soup.find_all("tr",{"class":re.compile('Detail.*')})
ret_list2 = []
for idx, content in enumerate(second_header):
if len(content.contents) == 3:
ret_list2.append([content.contents[1].contents[0], '', '', ''])
elif len(content.contents) == 7:
sublist = []
for idx2 in range(5):
if idx2 == 1:
continue
sublist.append(content.contents[idx2+1].contents[0])
ret_list2.append(sublist)
else:
print('WRONG')
except:
print(city_var+year_var)
ret_list3 = ret_list2[1:]
ret_list4 = [ret_list+sub for sub in ret_list3]
return pd.DataFrame(ret_list4)
list_of_city_year = [[x,y] for x in cities1 for y in years]
def return_df(list1):
df = pd.DataFrame()
c = list1[0]
y = list1[1]
df = df.append(create_df(c, y))
return df
with Pool(5) as p:
records = p.map(return_df, list_of_city_year[:100])
The error message is pretty long. It outputs the previous results as well, so I only put the error part:
MaybeEncodingError Traceback (most recent call last) in () 1 with Pool(5) as p: ----> 2 records = p.map(return_df, list_of_city_year[:100])
~/anaconda3/lib/python3.6/multiprocessing/pool.py in map(self, func, iterable, chunksize) 264 in a list that is returned. 265 ''' --> 266 return self._map_async(func, iterable, mapstar, chunksize).get() 267 268 def starmap(self, func, iterable, chunksize=None):
~/anaconda3/lib/python3.6/multiprocessing/pool.py in get(self, timeout) 642 return self._value 643 else: --> 644 raise self._value 645 646 def _set(self, i, obj):
MaybeEncodingError: Error sending result: '[ 0 1 2
3 4 5 \ ..... ..... ....]'. Reason: 'RecursionError('maximum recursion depth exceeded while calling a Python object',)'
If you have any suggestion on how to improve the code to make it more efficient, please post it below.