From 4e4796bb714a1672a6f9b6ea6c58a239a2606dd8 Mon Sep 17 00:00:00 2001
From: lolcat
Date: Mon, 29 Jul 2024 18:25:25 -0400
Subject: [PATCH] startpage captcha handle

---
Review note: this patch was re-flowed from a single-line paste, so the
indentation of context lines is reconstructed. The index blob ids below
predate the restyled detect_captcha() body.

 scraper/startpage.php | 52 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/scraper/startpage.php b/scraper/startpage.php
index ca15779..ef9def9 100644
--- a/scraper/startpage.php
+++ b/scraper/startpage.php
@@ -408,6 +408,8 @@ class startpage{
 			//$html = file_get_contents("scraper/startpage.html");
 		}
 
+		$this->detect_captcha($html);
+
 		if(
 			preg_match(
 				'/React\.createElement\(UIStartpage\.AppSerpWeb, ?(.+)\),$/m',
@@ -1057,6 +1059,8 @@ class startpage{
 			}
 		}
 
+		$this->detect_captcha($html);
+
 		$out = [
 			"status" => "ok",
 			"npt" => null,
@@ -1186,6 +1190,8 @@ class startpage{
 			}
 		}
 
+		$this->detect_captcha($html);
+
 		if(
 			preg_match(
 				'/React\.createElement\(UIStartpage\.AppSerpVideos, ?(.+)\),$/m',
@@ -1326,6 +1332,8 @@ class startpage{
 			}
 		}
 
+		$this->detect_captcha($html);
+
 		if(
 			preg_match(
 				'/React\.createElement\(UIStartpage\.AppSerpNews, ?(.+)\),$/m',
@@ -1526,4 +1534,48 @@ class startpage{
 			$text
 		);
 	}
+
+	/**
+	 * Abort the scrape when Startpage served an interstitial page
+	 * instead of search results.
+	 *
+	 * Loads the markup into $this->fuckhtml and returns quietly unless
+	 * the first <title> element equals "Redirecting...".
+	 *
+	 * @param string $html Page markup passed to fuckhtml->load() (assumed to be a string -- confirm against callers)
+	 * @throws Exception When an <a> element's text contains the captcha URL, or
+	 *                   for any other "Redirecting..." page
+	 */
+	private function detect_captcha($html){
+
+		$this->fuckhtml->load($html);
+
+		$titles = $this->fuckhtml->getElementsByTagName("title");
+
+		// regular pages are left for the caller to parse
+		if(
+			count($titles) === 0 ||
+			$titles[0]["innerHTML"] != "Redirecting..."
+		){
+
+			return;
+		}
+
+		// a link whose text mentions the captcha endpoint means we were challenged
+		$links = $this->fuckhtml->getElementsByTagName("a");
+
+		foreach($links as $link){
+
+			$label = $this->fuckhtml->getTextContent($link["innerHTML"]);
+
+			if(strpos($label, "https://www.startpage.com/sp/captcha") === false){
+
+				continue;
+			}
+
+			throw new Exception("Startpage returned a captcha");
+		}
+
+		throw new Exception("Startpage redirected the scraper to an unhandled page");
+	}
 }