From f2eb164d40340cf221cb7ad457ab35492da4d308 Mon Sep 17 00:00:00 2001 From: lolcat Date: Sat, 27 Apr 2024 14:25:39 -0400 Subject: [PATCH] qwant gibberish check --- scraper/qwant.php | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/scraper/qwant.php b/scraper/qwant.php index a8b69fe..7f441e5 100644 --- a/scraper/qwant.php +++ b/scraper/qwant.php @@ -453,6 +453,8 @@ class qwant{ switch($item["type"]){ // ignores ads case "web": + + $first_iteration = true; foreach($item["items"] as $result){ if(isset($result["thumbnailUrl"])){ @@ -483,6 +485,25 @@ class qwant{ } } + // detect gibberish results + if( + $first_iteration && + preg_match( + "/^" . + preg_quote( + $this->trimdots( + $result["source"] + ), + "/" + ) . + "/", + $result["url"] + ) !== 1 + ){ + + throw new Exception("Qwant returned gibberish results"); + } + $out["web"][] = [ "title" => $this->trimdots($result["title"]), "description" => $this->trimdots($result["desc"]), @@ -493,6 +514,8 @@ class qwant{ "sublink" => $sublinks, "table" => [] ]; + + $first_iteration = false; } break;