I'm trying to do a screen scrape using the cUrl library.
I managed to successfully screen scrape, few urls(5-10).
However whenever i run it in a for loop scraping a bulk(10-20) urls,
it will reach a point the last few urls will returns "HTTP/1.1 400 Bad Request".
Your browser sent a request that this server could not understand.
The number of request header fields exceeds this server's limit.
I'm pretty sure the urls are correct and correctly trimmed and the headers length are the same individually. If i put these last few urls on top of the list to scrape, it does go through, but the last few of the list again gets the 400 Bad request error. What could be the problem? What could be the cause?
Any advise ?
Something like below:
for($i=0;$i > sizeof($url);$i++) $data[$i] = $this->get($url[$i]); function get($url) { $this->headers[] = 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8, image/gif, image/x-bitmap, image/jpeg, image/pjpeg'; $this->headers[] = 'Connection: Keep-Alive'; $this->headers[] = 'Content-type: application/x-www-form-urlencoded;charset=UTF-8'; $this->user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.12) Gecko/20101026 Firefox/3.6.12 (.NET CLR 3.5.30729)'; set_time_limit(EXECUTION_TIME_LIMIT); $default_exec_time = ini_get('max_execution_time'); $this->redirectcount = 0; $process = curl_init($url); curl_setopt($process, CURLOPT_HTTPHEADER, $this->headers); curl_setopt($process, CURLOPT_HEADER, 1); curl_setopt($process, CURLOPT_USERAGENT, $this->user_agent); if ($this->cookies == TRUE) curl_setopt($process, CURLOPT_COOKIEFILE, $this->cookie_file); if ($this->cookies == TRUE) curl_setopt($process, CURLOPT_COOKIEJAR, $this->cookie_file); //off compression for debugging's sake //curl_setopt($process,CURLOPT_ENCODING , $this->compression); curl_setopt($process, CURLOPT_TIMEOUT, 180); if ($this->proxy) curl_setopt($process, CURLOPT_PROXY, $this->proxy); if ($this->proxyauth){ curl_setopt($process, CURLOPT_HTTPPROXYTUNNEL, 1); curl_setopt($process, CURLOPT_PROXYUSERPWD, $this->proxyauth); } curl_setopt($process, CURLOPT_RETURNTRANSFER, 1); curl_setopt($process, CURLOPT_FOLLOWLOCATION, TRUE); curl_setopt($process,CURLOPT_MAXREDIRS,10); //added //curl_setopt($process, CURLOPT_AUTOREFERER, 1); curl_setopt($process,CURLOPT_VERBOSE,TRUE); if ($this->referrer) curl_setopt($process,CURLOPT_REFERER,$this->referrer); if($this->cookies){ foreach($this->cookies as $cookie){ curl_setopt ($process, CURLOPT_COOKIE, $cookie); //echo $cookie; } } $return = $this->redirect_exec($process);//curl_exec($process) or curl_error($process); curl_close($process); set_time_limit($default_exec_time);//setback to default return $return; } function redirect_exec($ch, $curlopt_header = false) { //curl_setopt($ch, CURLOPT_HEADER, true); //curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); $data = curl_exec($ch); $file = fopen(DP_SCRAPE_DATA_CURL_DIR.$this->redirectcount.".html","w"); fwrite($file,$data); fclose($file); $info = curl_getinfo($ch); print_r($info);echo "
"; $http_code = $info['http_code']; if ($http_code == 301 || $http_code == 302 || $http_code == 303) { //list($header) = explode("\r\n\r\n", $data); //print_r($header); $matches = array(); //print_r($data); //Check if the response has a Location to redirect to preg_match('/(Location:|URI:)(.*?)\n/', $data, $matches); $url = trim(array_pop($matches)); //print_r($url); $url_parsed = parse_url($url); //print_r($url_parsed); if (isset($url_parsed['path']) && isset($url) && !empty($url) ) { //echo "
".$url; curl_setopt($ch, CURLOPT_URL, MY_HOST.$url); //echo "
".$url; $this->redirectcount++; return $this->redirect_exec($ch); //return $this->get(MY_HOST.$url); //$this->redirect_exec($ch); } } elseif($http_code == 200){ $matches = array(); preg_match('/(/i', $data, $matches); //print_r($matches); $url = trim(array_pop($matches)); //print_r($url); $url_parsed = parse_url($url); //print_r($url_parsed); if (isset($url_parsed['path']) && isset($url) && !empty($url) ) { curl_setopt($ch, CURLOPT_URL, $url); //echo "
".$url; $this->redirectcount++; sleep(SLEEP_INTERVAL); return $this->redirect_exec($ch); //return $this->get($url); //$this->redirect_exec($ch); } } //echo "data ".$data; $this->redirectcount++; return $data ; // $info['url']; }
where $urls are all the urls containing all query string for a get request
i realised from curl_getinfo , the [request_size ] is getting larger and larger which it shouldnt be.. it should be about the same size. How can i print/echo my http request information to debug?