diff --git a/scraper/startpage.php b/scraper/startpage.php
index ca15779..ef9def9 100644
--- a/scraper/startpage.php
+++ b/scraper/startpage.php
@@ -408,6 +408,8 @@ class startpage{
 			//$html = file_get_contents("scraper/startpage.html");
 		}
 		
+		$this->detect_captcha($html);
+		
 		if(
 			preg_match(
 				'/React\.createElement\(UIStartpage\.AppSerpWeb, ?(.+)\),$/m',
@@ -1057,6 +1059,8 @@ class startpage{
 			}
 		}
 		
+		$this->detect_captcha($html);
+		
 		$out = [
 			"status" => "ok",
 			"npt" => null,
@@ -1186,6 +1190,8 @@ class startpage{
 			}
 		}
 		
+		$this->detect_captcha($html);
+		
 		if(
 			preg_match(
 				'/React\.createElement\(UIStartpage\.AppSerpVideos, ?(.+)\),$/m',
@@ -1326,6 +1332,8 @@ class startpage{
 			}
 		}
 		
+		$this->detect_captcha($html);
+		
 		if(
 			preg_match(
 				'/React\.createElement\(UIStartpage\.AppSerpNews, ?(.+)\),$/m',
@@ -1526,4 +1534,61 @@ class startpage{
 			$text
 		);
 	}
+	
+	/**
+	 * Bail out when Startpage answered with its "Redirecting..." page
+	 * instead of a results page.
+	 *
+	 * Side effect: $html is loaded into $this->fuckhtml, replacing the
+	 * document the parser held before the call.
+	 *
+	 * @param $html Response body to inspect (handed to fuckhtml->load()).
+	 * @return void Returns normally unless the first <title> is
+	 *              exactly "Redirecting...".
+	 * @throws Exception "Startpage returned a captcha" if the text of any
+	 *                   <a> on that page contains the /sp/captcha URL,
+	 *                   otherwise the "unhandled page" message.
+	 */
+	private function detect_captcha($html){
+		
+		$this->fuckhtml->load($html);
+		
+		$title =
+			$this->fuckhtml
+			->getElementsByTagName(
+				"title"
+			);
+		
+		if(
+			count($title) !== 0 &&
+			$title[0]["innerHTML"] === "Redirecting..."
+		){
+			
+			// check if it's a captcha
+			$as =
+				$this->fuckhtml
+				->getElementsByTagName(
+					"a"
+				);
+			
+			foreach($as as $a){
+				
+				if(
+					strpos(
+						$this->fuckhtml
+						->getTextContent(
+							$a["innerHTML"]
+						),
+						"https://www.startpage.com/sp/captcha"
+					) !== false
+				){
+					
+					throw new Exception("Startpage returned a captcha");
+				}
+			}
+			
+			// redirect page without a captcha link: still not a results page
+			throw new Exception("Startpage redirected the scraper to an unhandled page");
+		}
+	}
 }