backend = new backend("startpage"); include "lib/fuckhtml.php"; $this->fuckhtml = new fuckhtml(); }

	/**
	 * Return the filter definitions offered for a result page type.
	 *
	 * Each filter maps to "display" (label) and "option" (value => label).
	 * The trailing comments name the Startpage request parameter that the
	 * filter is translated to by the search methods below.
	 *
	 * @param string $page One of "web", "images", "videos", "news".
	 * @return array|null Filter definitions; nothing is returned for any
	 *                    other page type.
	 */
	public function getfilters($page){
		
		// shared by every page type
		$nsfw = [ // qadf
			"display" => "NSFW",
			"option" => [
				"yes" => "Yes", // qadf=none
				"no" => "No" // qadf=heavy
			]
		];
		
		switch($page){
			
			case "web":
				return [
					"country" => [
						"display" => "Country",
						"option" => [
							"any" => "All Regions",
							"es_AR" => "Argentina",
							"en_AU" => "Australia",
							"de_AT" => "Austria",
							"ru_BY" => "Belarus",
							"fr_BE" => "Belgium (FR)",
							"nl_BE" => "Belgium (NL)",
							"bg_BG" => "Bulgaria",
							"en_CA" => "Canada (EN)",
							"fr_CA" => "Canada (FR)",
							"es_CL" => "Chile",
							"es_CO" => "Colombia",
							"cs_CZ" => "Czech Republic",
							"da_DK" => "Denmark",
							"ar_EG" => "Egypt",
							"et_EE" => "Estonia",
							"fi_FI" => "Finland",
							"fr_FR" => "France",
							"de_DE" => "Germany",
							"el_GR" => "Greece",
							"hu_HU" => "Hungary",
							"hi_IN" => "India (HI)",
							"en_IN" => "India (EN)",
							"id_ID" => "Indonesia (ID)",
							"en_ID" => "Indonesia (EN)",
							"en_IE" => "Ireland",
							"it_IT" => "Italy",
							"ja_JP" => "Japan",
							"ko_KR" => "Korea",
							"ms_MY" => "Malaysia (MS)",
							"en_MY" => "Malaysia (EN)",
							"es_MX" => "Mexico",
							"nl_NL" => "Netherlands",
							"en_NZ" => "New Zealand",
							"no_NO" => "Norway",
							"es_PE" => "Peru",
							"fil_PH" => "Philippines (FIL)",
							"en_PH" => "Philippines (EN)",
							"pl_PL" => "Poland",
							"pt_PT" => "Portugal",
							"ro_RO" => "Romania",
							"ru_RU" => "Russia",
							"ms_SG" => "Singapore (MS)",
							"en_SG" => "Singapore (EN)",
							"es_ES" => "Spain (ES)",
							"ca_ES" => "Spain (CA)",
							"sv_SE" => "Sweden",
							"de_CH" => "Switzerland (DE)",
							"fr_CH" => "Switzerland (FR)",
							"it_CH" => "Switzerland (IT)",
							"tr_TR" => "Turkey",
							"uk_UA" => "Ukraine",
							"en_US" => "US (EN)",
							"es_US" => "US (ES)",
							"es_UY" => "Uruguay",
							"es_VE" => "Venezuela",
							"vi_VN" => "Vietnam (VI)",
							"en_VN" => "Vietnam (EN)",
							"en_ZA" => "South Africa"
						]
					],
					"nsfw" => $nsfw,
					"time" => [ // with_date
						"display" => "Time posted",
						"option" => [
							"any" => "Any time",
							"d" => "Past 24 hours",
							"w" => "Past week",
							"m" => "Past month",
							"y" => "Past year",
						]
					],
					"extendedsearch" => [
						// undefined display, so it wont show in frontend
						"option" => [
							"yes" => "Yes",
							"no" => "No"
						]
					]
				];
			
			case "images":
				return [
					"nsfw" => $nsfw,
					"size" => [ // flimgsize
						"display" => "Size",
						"option" => [
							"any" => "Any size",
							"Small" => "Small",
							"Medium" => "Medium",
							"Large" => "Large",
							"Wallpaper" => "Wallpaper",
							// from here, image-size-select, var prefix = isz:lt,islt:
							// (the "qsvgs" entry used to be listed twice; the
							// duplicate key had no effect and was dropped)
							"qsvgs" => "Larger than 400x300",
							"vga" => "Larger than 640x480",
							"svga" => "Larger than 800x600",
							"xga" => "Larger than 1024x768",
							"2mp" => "Larger than 2 MP (1600x1200)",
							"4mp" => "Larger than 4 MP (2272x1704)",
							"6mp" => "Larger than 6 MP (2816x2112)",
							"8mp" => "Larger than 8 MP (3264x2448)",
							"10mp" => "Larger than 10 MP (3648x2736)",
							"12mp" => "Larger than 12 MP (4096x3072)",
							"15mp" => "Larger than 15 MP (4480x3360)",
							"20mp" => "Larger than 20 MP (5120x3840)",
							"40mp" => "Larger than 40 MP (7216x5412)",
							"70mp" => "Larger than 70 MP (9600x7200)"
						]
					],
					"color" => [ // flimgcolor
						"display" => "Color",
						"option" => [
							"any" => "Any color",
							// from here, var prefix = ic:
							"color" => "Color only",
							"bnw" => "Black & white", // set to "gray"
							// from here, var prefix = ic:specific,isc:
							"red" => "Red",
							"orange" => "Orange",
							"yellow" => "Yellow",
							"green" => "Green",
							"teal" => "Teal",
							"blue" => "Blue",
							"purple" => "Purple",
							"pink" => "Pink",
							"white" => "White",
							"gray" => "Gray",
							"black" => "Black",
							"brown" => "Brown"
						]
					],
					"type" => [ // flimgtype
						"display" => "Type",
						"option" => [
							"any" => "Any type",
							"AnimatedGif" => "Animated GIF",
							"Clipart" => "Clip Art",
							"Line" => "Line Drawing",
							"Photo" => "Photograph",
							"Transparent" => "Transparent Background"
						]
					],
					"license" => [ // flimglicense
						"display" => "License",
						"option" => [
							"any" => "Any license",
							"p" => "Public domain",
							"s" => "Free to share",
							"sc" => "Free to share commercially",
							"m" => "Free to modify",
							"mc" => "Free to modify commercially"
						]
					]
				];
			
			case "videos":
				return [
					"nsfw" => $nsfw,
					"sort" => [
						"display" => "Sort by",
						"option" => [
							"relevance" => "Most relevant",
							"popular" => "Most popular",
							"recent" => "Most recent"
						]
					],
					"duration" => [ // with_duration
						"display" => "Duration",
						"option" => [
							"any" => "Any duration",
							"short" => "Short",
							"medium" => "Medium",
							"long" => "Long"
						]
					]
				];
			
			case "news":
				return [
					"nsfw" => $nsfw,
					"time" => [ // with_date
						"display" => "Time posted",
						"option" => [
							"any" => "Any time",
							"d" => "Past 24 hours",
							"w" => "Past week",
							"m" => "Past month"
						]
					]
				];
			
			// Sample "preferences" cookies, kept for reference:
			//preferences=date_timeEEEworldN1Ndisable_family_filterEEE1N1Ndisable_open_in_new_windowEEE0N1Nenable_post_methodEEE1N1Nenable_proxy_safety_suggestEEE0N1Nenable_stay_controlEEE0N1Ninstant_answersEEE1N1Nlang_homepageEEEs%2Fdevice%2FenN1NlanguageEEEazerbaijaniN1Nlanguage_uiEEEenglishN1Nnum_of_resultsEEE20N1Nsearch_results_regionEEEallN1NsuggestionsEEE1N1Nwt_unitEEEcelsius; Domain=startpage.com; Expires=Mon, 28 Oct 2024 20:21:58 GMT; Secure; Path=/
			//preferences=date_timeEEEworldN1Ndisable_family_filterEEE1N1Ndisable_open_in_new_windowEEE0N1Nenable_post_methodEEE1N1Nenable_proxy_safety_suggestEEE0N1Nenable_stay_controlEEE0N1Ninstant_answersEEE1N1Nlang_homepageEEEs%2Fdevice%2FenN1NlanguageEEEenglishN1Nlanguage_uiEEEenglishN1Nnum_of_resultsEEE20N1Nsearch_results_regionEEEallN1NsuggestionsEEE1N1Nwt_unitEEEcelsius; Domain=startpage.com; Expires=Mon, 28 Oct 2024 20:22:52 GMT; Secure; Path=/
		}
	}
	
	/**
	 * Perform an HTTP request against Startpage and return the raw body.
	 *
	 * @param mixed        $proxy  Proxy descriptor, handed to backend::assign_proxy().
	 * @param string       $url    Target URL.
	 * @param array|string $get    Query parameters (GET) or the request body (POST).
	 *                             Must be a string when $post or $is_xhr is true,
	 *                             because its length is sent as Content-Length.
	 * @param bool         $post   Send $get as the POST body instead of a query string.
	 * @param bool         $is_xhr Send JSON/XHR style headers.
	 * @return string Response body.
	 * @throws Exception With the curl error message when the transfer fails.
	 */
	private function get($proxy, $url, $get = [], $post = false, $is_xhr = false){
		
		$curlproc = curl_init();
		
		if($post === true){
			
			curl_setopt($curlproc, CURLOPT_POST, true);
			curl_setopt($curlproc, CURLOPT_POSTFIELDS, $get);
			
		}elseif($get !== []){
			
			$get = http_build_query($get);
			$url .= "?" . $get;
		}
		
		curl_setopt($curlproc, CURLOPT_URL, $url);
		
		// http2 bypass
		curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		
		// identical preferences cookie for every request flavour
		$cookie = "Cookie: preferences=date_timeEEEworldN1Ndisable_family_filterEEE1N1Ndisable_open_in_new_windowEEE0N1Nenable_post_methodEEE1N1Nenable_proxy_safety_suggestEEE0N1Nenable_stay_controlEEE0N1Ninstant_answersEEE1N1Nlang_homepageEEEs%2Fdevice%2FenN1NlanguageEEEenglishN1Nlanguage_uiEEEenglishN1Nnum_of_resultsEEE20N1Nsearch_results_regionEEEallN1NsuggestionsEEE1N1Nwt_unitEEEcelsius";
		
		if($is_xhr === true){
			
			$headers = [
				"User-Agent: " . config::USER_AGENT,
				"Accept: application/json",
				"Accept-Language: en-US,en;q=0.5",
				"Accept-Encoding: gzip",
				"Referer: https://www.startpage.com/",
				"Content-Type: application/json",
				"Content-Length: " . strlen($get),
				"Origin: https://www.startpage.com/",
				"DNT: 1",
				"Connection: keep-alive",
				$cookie,
				"Sec-Fetch-Dest: empty",
				"Sec-Fetch-Mode: cors",
				"Sec-Fetch-Site: same-origin",
				"TE: trailers"
			];
			
		}elseif($post === true){
			
			$headers = [
				"User-Agent: " . config::USER_AGENT,
				"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
				"Accept-Language: en-US,en;q=0.5",
				"Accept-Encoding: gzip",
				"Referer: https://www.startpage.com/",
				"Content-Type: application/x-www-form-urlencoded",
				"Content-Length: " . strlen($get),
				"DNT: 1",
				"Connection: keep-alive",
				$cookie,
				"Upgrade-Insecure-Requests: 1",
				"Sec-Fetch-Dest: document",
				"Sec-Fetch-Mode: navigate",
				"Sec-Fetch-Site: none",
				"Sec-Fetch-User: ?1",
				"Priority: u=0, i",
				"TE: trailers"
			];
			
		}else{
			
			$headers = [
				"User-Agent: " . config::USER_AGENT,
				"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
				"Accept-Language: en-US,en;q=0.5",
				"Accept-Encoding: gzip",
				"DNT: 1",
				"Connection: keep-alive",
				$cookie,
				"Sec-Fetch-Dest: document",
				"Sec-Fetch-Mode: navigate",
				"Sec-Fetch-Site: none",
				"Sec-Fetch-User: ?1",
				"Priority: u=0, i",
				"TE: trailers"
			];
		}
		
		curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
		
		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		
		$this->backend->assign_proxy($curlproc, $proxy);
		
		$data = curl_exec($curlproc);
		
		if(curl_errno($curlproc)){
			
			// read the message first, then release the handle before throwing
			$message = curl_error($curlproc);
			curl_close($curlproc);
			
			throw new Exception($message);
		}
		
		curl_close($curlproc);
		return $data;
	}
	
	/**
	 * Run a web search (or continue one through a next-page token).
	 *
	 * @param array $get Request parameters. Reads "npt", "s", "nsfw",
	 *                   "country", "time" and "extendedsearch".
	 * @return array Result set with the keys "status", "spelling", "npt",
	 *               "answer", "web", "image", "video", "news", "related".
	 * @throws Exception When the page cannot be fetched or parsed, or when
	 *                   Startpage answers with a captcha/redirect.
	 */
	public function web($get){
		
		if($get["npt"]){
			
			[$post, $proxy] = $this->backend->get($get["npt"], "web");
			
			try{
				$html = $this->get(
					$proxy,
					"https://www.startpage.com/sp/search",
					$post,
					true
				);
			}catch(Exception $error){
				
				throw new Exception("Failed to fetch search page");
			}
			
			// instant answers are only requested for the first page
			$get_instant_answer = false;
			
		}else{
			
			// same guard as image(), video() and news()
			if(strlen($get["s"]) === 0){
				
				throw new Exception("Search term is empty!");
			}
			
			$proxy = $this->backend->get_ip();
			
			$params = [
				"query" => $get["s"],
				"cat" => "web",
				"pl" => "opensearch"
			];
			
			if($get["nsfw"] == "no"){
				
				$params["qadf"] = "heavy";
				$get_instant_answer = false;
			}else{
				
				$get_instant_answer = true;
			}
			
			if($get["country"] !== "any"){
				
				$params["qsr"] = $get["country"];
			}
			
			if($get["time"] !== "any"){
				
				$params["with_date"] = $get["time"];
			}
			
			try{
				$html = $this->get(
					$proxy,
					"https://www.startpage.com/sp/search",
					$params
				);
			}catch(Exception $error){
				
				throw new Exception("Failed to fetch search page");
			}
			
			//$html = file_get_contents("scraper/startpage.html");
		}
		
		$this->detect_captcha($html);
		
		// !== 1 also catches preg_match() returning false on a PCRE error
		if(
			preg_match(
				'/React\.createElement\(UIStartpage\.AppSerpWeb, ?(.+)\),$/m',
				$html,
				$matches
			) !== 1
		){
			
			throw new Exception("Failed to grep JSON object");
		}
		
		$json = json_decode($matches[1], true);
		
		if($json === null){
			
			throw new Exception("Failed to decode JSON");
		}
		
		//print_r($json);
		
		$out = [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => null,
			"answer" => [],
			"web" => [],
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];
		
		// get npt
		$out["npt"] = $this->parse_npt($json, "web", $proxy);
		
		foreach(($json["render"]["presenter"]["regions"]["mainline"] ?? []) as $category){
			
			if(!isset($category["display_type"])){
				
				continue;
			}
			
			switch($category["display_type"]){
				
				case "web-google":
					foreach($category["results"] as $result){
						
						$sublinks = [];
						
						foreach(($result["siteLinks"] ?? []) as $sublink){
							
							$sublinks[] = [
								"title" => $sublink["title"],
								"description" => null,
								"url" => $sublink["clickUrl"]
							];
						}
						
						// a leading "<date> ... " prefix is split off the snippet
						$description =
							explode(
								"...",
								$this->titledots(
									html_entity_decode(
										$this->fuckhtml
										->getTextContent(
											$result["description"]
										)
									)
								),
								2
							);
						
						$date = strtotime(trim($description[0]));
						
						if(
							$date === false ||
							count($description) !== 2 ||
							strlen($description[0]) > 14
						){
							
							// no date found
							$description = implode(" ... ", $description);
							$date = null;
						}else{
							
							// date found
							$description = ltrim($description[1]);
						}
						
						$out["web"][] = [
							"title" =>
								$this->titledots(
									html_entity_decode(
										$this->fuckhtml
										->getTextContent(
											$result["title"]
										)
									)
								),
							"description" => $description,
							"url" => $result["clickUrl"],
							"date" => $date,
							"type" => "web",
							"thumb" => [
								"url" => null,
								"ratio" => null
							],
							"sublink" => $sublinks,
							"table" => []
						];
					}
					break;
				
				case "images-qi-top":
					foreach($category["results"] as $result){
						
						$out["image"][] = [
							"title" =>
								$this->titledots(
									html_entity_decode(
										$this->fuckhtml
										->getTextContent(
											$result["title"]
										)
									)
								),
							"source" => [
								[
									"url" => $result["rawImageUrl"],
									"width" => (int)$result["width"],
									"height" => (int)$result["height"]
								],
								[
									"url" => $this->unshitimage($result["mdThumbnailUrl"]),
									"width" => (int)$result["mdThumbnailWidth"],
									"height" => (int)$result["mdThumbnailHeight"]
								]
							],
							"url" => $result["altClickUrl"]
						];
					}
					break;
				
				case "spellsuggest-google":
					$out["spelling"] = [
						"type" => "including",
						"using" => $json["render"]["query"],
						"correction" => $category["results"][0]["query"]
					];
					break;
				
				case "dictionary-qi":
					foreach($category["results"] as $result){
						
						$answer = [
							"title" => $result["word"],
							"description" => [],
							"url" => null,
							"thumb" => null,
							"table" => [],
							"sublink" => []
						];
						
						foreach($result["lexical_categories"] as $lexic_type => $definitions){
							
							$answer["description"][] = [
								"type" => "title",
								"value" => $lexic_type
							];
							
							$i = 0;
							foreach($definitions as $definition){
								
								// fields are treated as optional
								$text_definition = trim($definition["definition"] ?? "");
								$text_example = trim($definition["example"] ?? "");
								$text_synonyms = implode(", ", $definition["synonyms"] ?? []);
								
								if($text_definition != ""){
									
									$i++;
									$c = count($answer["description"]) - 1;
									
									// append to the previous text node when there is one
									if(
										$c !== 0 &&
										$answer["description"][$c]["type"] == "text"
									){
										
										$answer["description"][$c]["value"] .=
											"\n\n" . $i . ". " . $text_definition;
									}else{
										
										$answer["description"][] = [
											"type" => "text",
											"value" => $i . ". " . $text_definition
										];
									}
								}
								
								if($text_example != ""){
									
									$answer["description"][] = [
										"type" => "quote",
										"value" => $text_example
									];
								}
								
								if($text_synonyms != ""){
									
									$answer["description"][] = [
										"type" => "text",
										"value" => "Synonyms: " . $text_synonyms
									];
								}
							}
						}
						
						$out["answer"][] = $answer;
					}
					break;
			}
		}
		
		// parse instant answers
		if(
			$get["extendedsearch"] == "yes" &&
			$get_instant_answer === true
		){
			
			// https://www.startpage.com/sp/qi?qimsn=ex&sxap=%2Fv1%2Fquery&sc=BqZ3inqrAgF701&sr=1
			try{
				
				$post = [
					"se" => "n0vze2y9dqwy",
					"q" => $json["render"]["query"],
					"results" => [], // populate
					"enableKnowledgePanel" => true,
					"enableMediaThumbBar" => false,
					"enableSearchSuggestions" => false,
					"enableTripadvisorProperties" => [],
					"enableTripadvisorPlaces" => [],
					"enableTripadvisorPlacesForLocations" => [],
					"enableWebProducts" => false,
					"tripadvisorPartnerId" => null,
					"tripadvisorMapColorMode" => "light",
					"tripadvisorDisablesKnowledgePanel" => false,
					"instantAnswers" => [
						"smartAnswers",
						"youtube",
						"tripadvisor"
					],
					"iaType" => null,
					"forceEnhancedKnowledgePanel" => false,
					"shoppingOnly" => false,
					"allowAdultProducts" => true,
					"lang" => "en",
					"browserLang" => "en-US",
					"browserTimezone" => "America/New_York",
					"market" => null,
					"userLocation" => null,
					"userDate" => date("Y-m-d"),
					"userAgentType" => "unknown"
				];
				
				foreach($out["web"] as $result){
					
					$post["results"][] = [
						"url" => $result["url"],
						"title" => $result["title"]
					];
				}
				
				$post = json_encode($post, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE);
				
				$additional_data =
					$this->get(
						$proxy,
						"https://www.startpage.com/sp/qi?qimsn=ex&sxap=%2Fv1%2Fquery&sc=" . $json["render"]["callback_sc"] . "&sr=1",
						$post,
						true,
						true
					);
				
				$additional_data = json_decode($additional_data, true);
				
				if($additional_data === null){
					
					// just break out, dont fail completely
					throw new Exception("Failed to decode JSON");
				}
				
				if(!isset($additional_data["knowledgePanel"])){
					
					throw new Exception("Response has missing data (knowledgePanel)");
				}
				
				$additional_data = $additional_data["knowledgePanel"];
				
				$answer = [
					"title" => $additional_data["meta"]["title"],
					"description" => [
						[
							"type" => "quote",
							"value" => $additional_data["meta"]["description"]
						]
					],
					"url" => $additional_data["meta"]["origWikiUrl"],
					"thumb" => $additional_data["meta"]["image"],
					"table" => [],
					"sublink" => []
				];
				
				// parse html for instant answer
				$this->fuckhtml->load($additional_data["html"]);
				
				$div =
					$this->fuckhtml
					->getElementsByTagName(
						"div"
					);
				
				// get description
				$description =
					$this->fuckhtml
					->getElementsByClassName(
						"sx-kp-short-extract sx-kp-short-extract-complete",
						$div
					);
				
				if(count($description) !== 0){
					
					$answer["description"][] = [
						"type" => "text",
						"value" =>
							html_entity_decode(
								$this->fuckhtml
								->getTextContent(
									$description[0]
								)
							)
					];
				}
				
				// get socials
				$socials =
					$this->fuckhtml
					->getElementsByClassName(
						"sx-wiki-social-link",
						"a"
					);
				
				foreach($socials as $social){
					
					$title =
						$this->fuckhtml
						->getTextContent(
							$social["attributes"]["title"]
						);
					
					$url =
						$this->fuckhtml
						->getTextContent(
							$social["attributes"]["href"]
						);
					
					if($title == "Official Website"){
						
						$title = "Website";
					}
					
					$answer["sublink"][$title] = $url;
				}
				
				// get videos
				$videos =
					$this->fuckhtml
					->getElementsByClassName(
						"sx-kp-video-grid-item",
						$div
					);
				
				foreach($videos as $video){
					
					$this->fuckhtml->load($video);
					
					$as =
						$this->fuckhtml
						->getElementsByTagName(
							"a"
						);
					
					if(count($as) === 0){
						
						// ?? invalid
						continue;
					}
					
					$image =
						$this->fuckhtml
						->getElementsByAttributeName(
							"data-sx-src",
							"img"
						);
					
					if(count($image) !== 0){
						
						$thumb = [
							"ratio" => "16:9",
							"url" =>
								$this->fuckhtml
								->getTextContent(
									$image[0]["attributes"]["data-sx-src"]
								)
						];
					}else{
						
						$thumb = [
							"ratio" => null,
							"url" => null
						];
					}
					
					$out["video"][] = [
						"title" =>
							$this->fuckhtml
							->getTextContent(
								$as[0]["attributes"]["title"]
							),
						"description" => null,
						"date" => null,
						"duration" => null,
						"views" => null,
						"thumb" => $thumb,
						"url" =>
							$this->fuckhtml
							->getTextContent(
								$as[0]["attributes"]["href"]
							)
					];
				}
				
				// reset
				$this->fuckhtml->load($additional_data["html"]);
				
				// get table elements
				$table =
					$this->fuckhtml
					->getElementsByClassName(
						"sx-infobox",
						"table"
					);
				
				if(count($table) !== 0){
					
					$trs =
						$this->fuckhtml
						->getElementsByTagName(
							"tr"
						);
					
					foreach($trs as $tr){
						
						$this->fuckhtml->load($tr);
						
						// in this markup the <td> holds the content and
						// the <th> holds the row title
						$tds =
							$this->fuckhtml
							->getElementsByTagName(
								"td"
							);
						
						$ths =
							$this->fuckhtml
							->getElementsByTagName(
								"th"
							);
						
						if(
							count($ths) === 1 &&
							count($tds) === 1
						){
							
							$title =
								$this->fuckhtml
								->getTextContent(
									$ths[0]
								);
							
							$description = [];
							
							$this->fuckhtml->load($tds[0]);
							
							$lis =
								$this->fuckhtml
								->getElementsByTagName(
									"li"
								);
							
							if(count($lis) !== 0){
								
								foreach($lis as $li){
									
									$description[] =
										$this->fuckhtml
										->getTextContent(
											$li
										);
								}
								
								$description = implode(", ", $description);
							}else{
								
								$description =
									$this->fuckhtml
									->getTextContent(
										$tds[0]
									);
							}
							
							$answer["table"][$title] = $description;
						}
					}
				}
				
				$out["answer"][] = $answer;
				
			}catch(Exception $error){
				
				// deliberate best effort: a failed instant answer must not
				// discard the web results collected above
				//echo "error!";
			}
		}
		
		return $out;
	}
	
	/**
	 * Run an image search (or continue one through a next-page token).
	 *
	 * @param array $get Request parameters. Reads "npt", "s", "nsfw",
	 *                   "size", "color", "type" and "license".
	 * @return array Result set with the keys "status", "npt", "image".
	 * @throws Exception On empty query, fetch/parse failure or captcha.
	 */
	public function image($get){
		
		if($get["npt"]){
			
			[$post, $proxy] = $this->backend->get($get["npt"], "images");
			
			try{
				$html = $this->get(
					$proxy,
					"https://www.startpage.com/sp/search",
					$post,
					true
				);
			}catch(Exception $error){
				
				throw new Exception("Failed to fetch search page");
			}
			
		}else{
			
			$search = $get["s"];
			if(strlen($search) === 0){
				
				throw new Exception("Search term is empty!");
			}
			
			// one try block: every failure in here surfaces with the same message
			try{
				
				$proxy = $this->backend->get_ip();
				
				$params = [
					"query" => $get["s"],
					"cat" => "images",
					"pl" => "opensearch"
				];
				
				if($get["nsfw"] == "no"){
					
					$params["qadf"] = "heavy";
				}
				
				if($get["size"] != "any"){
					
					if(in_array($get["size"], ["Small", "Medium", "Large", "Wallpaper"], true)){
						
						$params["flimgsize"] = $get["size"];
					}else{
						
						$params["image-size-select"] = "isz:lt,islt:" . $get["size"];
					}
				}
				
				if($get["color"] != "any"){
					
					if($get["color"] == "color"){
						
						$params["flimgcolor"] = "ic:color";
					}elseif($get["color"] == "bnw"){
						
						$params["flimgcolor"] = "ic:gray";
					}else{
						
						$params["flimgcolor"] = "ic:specific,isc:" . $get["color"];
					}
				}
				
				if($get["type"] != "any"){
					
					$params["flimgtype"] = $get["type"];
				}
				
				if($get["license"] != "any"){
					
					$params["flimglicense"] = $get["license"];
				}
				
				$html = $this->get(
					$proxy,
					"https://www.startpage.com/sp/search",
					$params
				);
				
				//$html = file_get_contents("scraper/startpage.html");
				
			}catch(Exception $error){
				
				throw new Exception("Failed to fetch search page");
			}
		}
		
		$this->detect_captcha($html);
		
		$out = [
			"status" => "ok",
			"npt" => null,
			"image" => []
		];
		
		// !== 1 also catches preg_match() returning false on a PCRE error
		if(
			preg_match(
				'/React\.createElement\(UIStartpage\.AppSerpImages, ?(.+)\),$/m',
				$html,
				$matches
			) !== 1
		){
			
			throw new Exception("Failed to grep JSON object");
		}
		
		$json = json_decode($matches[1], true);
		
		if($json === null){
			
			throw new Exception("Failed to decode JSON object");
		}
		
		// get npt
		$out["npt"] = $this->parse_npt($json, "images", $proxy);
		
		// get images
		foreach(($json["render"]["presenter"]["regions"]["mainline"] ?? []) as $category){
			
			if(($category["display_type"] ?? null) != "images-bing"){
				
				// ignore ads and !! suggestions !! @todo
				continue;
			}
			
			foreach($category["results"] as $image){
				
				$out["image"][] = [
					"title" => $this->titledots($image["title"]),
					"source" => [
						[
							"url" => $this->unshitimage($image["clickUrl"]),
							"width" => (int)$image["width"],
							"height" => (int)$image["height"]
						],
						[
							"url" => $this->unshitimage($image["thumbnailUrl"]),
							"width" => (int)$image["thumbnailWidth"],
							"height" => (int)$image["thumbnailHeight"]
						]
					],
					"url" => $image["altClickUrl"]
				];
			}
		}
		
		return $out;
	}
	
	/**
	 * Run a video search (or continue one through a next-page token).
	 *
	 * @param array $get Request parameters. Reads "npt", "s", "nsfw",
	 *                   "sort" and "duration".
	 * @return array Result set with the keys "status", "npt", "video",
	 *               "author", "livestream", "playlist", "reel".
	 * @throws Exception On empty query, fetch/parse failure or captcha.
	 */
	public function video($get){
		
		if($get["npt"]){
			
			[$post, $proxy] = $this->backend->get($get["npt"], "videos");
			
			try{
				$html = $this->get(
					$proxy,
					"https://www.startpage.com/sp/search",
					$post,
					true
				);
			}catch(Exception $error){
				
				throw new Exception("Failed to fetch search page");
			}
			
		}else{
			
			$search = $get["s"];
			if(strlen($search) === 0){
				
				throw new Exception("Search term is empty!");
			}
			
			// one try block: every failure in here surfaces with the same message
			try{
				
				$proxy = $this->backend->get_ip();
				
				$params = [
					"query" => $get["s"],
					"cat" => "video",
					"pl" => "opensearch"
				];
				
				if($get["nsfw"] == "no"){
					
					$params["qadf"] = "heavy";
				}
				
				if($get["sort"] != "relevance"){
					
					$params["sort_by"] = $get["sort"];
				}
				
				if($get["duration"] != "any"){
					
					$params["with_duration"] = $get["duration"];
				}
				
				$html = $this->get(
					$proxy,
					"https://www.startpage.com/sp/search",
					$params
				);
				
				//$html = file_get_contents("scraper/startpage.html");
				
			}catch(Exception $error){
				
				throw new Exception("Failed to fetch search page");
			}
		}
		
		$this->detect_captcha($html);
		
		// !== 1 also catches preg_match() returning false on a PCRE error
		if(
			preg_match(
				'/React\.createElement\(UIStartpage\.AppSerpVideos, ?(.+)\),$/m',
				$html,
				$matches
			) !== 1
		){
			
			throw new Exception("Failed to get JSON object");
		}
		
		$json = json_decode($matches[1], true);
		
		if($json === null){
			
			throw new Exception("Failed to decode JSON object");
		}
		
		$out = [
			"status" => "ok",
			"npt" => null,
			"video" => [],
			"author" => [],
			"livestream" => [],
			"playlist" => [],
			"reel" => []
		];
		
		// get npt
		// NOTE(review): the token is stored under page type "video" here but
		// read back with "videos" above -- confirm the backend treats both
		// the same. "video" is also what is sent as the "cat" parameter.
		$out["npt"] = $this->parse_npt($json, "video", $proxy);
		
		// get results
		foreach(($json["render"]["presenter"]["regions"]["mainline"] ?? []) as $category){
			
			if(($category["display_type"] ?? null) != "video-youtube"){
				
				continue;
			}
			
			foreach($category["results"] as $video){
				
				if(
					isset($video["thumbnailUrl"]) &&
					$video["thumbnailUrl"] !== null
				){
					
					$thumb = [
						"ratio" => "16:9",
						"url" => $this->unshitimage($video["thumbnailUrl"])
					];
				}else{
					
					$thumb = [
						"ratio" => null,
						"url" => null
					];
				}
				
				$out["video"][] = [
					"title" => $video["title"],
					"description" => $this->limitstrlen($video["description"]),
					"author" => [
						"name" => $video["channelTitle"],
						"url" => null,
						"avatar" => null
					],
					"date" => strtotime($video["publishDate"]),
					"duration" => $this->hms2int($video["duration"]),
					"views" => (int)$video["viewCount"],
					"thumb" => $thumb,
					"url" => $video["clickUrl"]
				];
			}
		}
		
		return $out;
	}
	
	/**
	 * Run a news search (or continue one through a next-page token).
	 *
	 * @param array $get Request parameters. Reads "npt", "s", "nsfw" and "time".
	 * @return array Result set with the keys "status", "npt", "news".
	 * @throws Exception On empty query, fetch/parse failure or captcha.
	 */
	public function news($get){
		
		if($get["npt"]){
			
			[$post, $proxy] = $this->backend->get($get["npt"], "news");
			
			try{
				$html = $this->get(
					$proxy,
					"https://www.startpage.com/sp/search",
					$post,
					true
				);
			}catch(Exception $error){
				
				throw new Exception("Failed to fetch search page");
			}
			
		}else{
			
			$search = $get["s"];
			if(strlen($search) === 0){
				
				throw new Exception("Search term is empty!");
			}
			
			// one try block: every failure in here surfaces with the same message
			try{
				
				$proxy = $this->backend->get_ip();
				
				$params = [
					"query" => $get["s"],
					"cat" => "news",
					"pl" => "opensearch"
				];
				
				if($get["nsfw"] == "no"){
					
					$params["qadf"] = "heavy";
				}
				
				if($get["time"] != "any"){
					
					$params["with_date"] = $get["time"];
				}
				
				$html = $this->get(
					$proxy,
					"https://www.startpage.com/sp/search",
					$params
				);
				
				//$html = file_get_contents("scraper/startpage.html");
				
			}catch(Exception $error){
				
				throw new Exception("Failed to fetch search page");
			}
		}
		
		$this->detect_captcha($html);
		
		// !== 1 also catches preg_match() returning false on a PCRE error
		if(
			preg_match(
				'/React\.createElement\(UIStartpage\.AppSerpNews, ?(.+)\),$/m',
				$html,
				$matches
			) !== 1
		){
			
			throw new Exception("Failed to get JSON object");
		}
		
		$json = json_decode($matches[1], true);
		
		if($json === null){
			
			throw new Exception("Failed to decode JSON object");
		}
		
		$out = [
			"status" => "ok",
			"npt" => null,
			"news" => []
		];
		
		// get npt
		$out["npt"] = $this->parse_npt($json, "news", $proxy);
		
		foreach(($json["render"]["presenter"]["regions"]["mainline"] ?? []) as $category){
			
			if(($category["display_type"] ?? null) != "news-bing"){
				
				// unsupported category
				continue;
			}
			
			foreach($category["results"] as $news){
				
				if(
					isset($news["thumbnailUrl"]) &&
					$news["thumbnailUrl"] !== null
				){
					
					$thumb = [
						"ratio" => "16:9",
						"url" => $this->unshitimage($news["thumbnailUrl"])
					];
				}else{
					
					$thumb = [
						"ratio" => null,
						"url" => null
					];
				}
				
				$out["news"][] = [
					"title" => $this->titledots($this->remove_penguins($news["title"])),
					"author" => $news["source"],
					"description" => $this->titledots($this->remove_penguins($news["description"])),
					// drop the last three digits of the timestamp
					"date" => (int)substr((string)$news["date"], 0, -3),
					"thumb" => $thumb,
					"url" => $news["clickUrl"]
				];
			}
		}
		
		return $out;
	}
	
	/**
	 * Build and store the next-page token from the pagination data.
	 *
	 * Looks for the pagination entry named "Next", rebuilds the POST body
	 * for that page from its URL and hands it to backend::store().
	 *
	 * @param array  $json     Decoded page state.
	 * @param string $pagetype Sent as the "cat" field and passed to the backend.
	 * @param mixed  $proxy    Proxy to keep using for the next page.
	 * @return mixed Whatever backend::store() returns, or null when there
	 *               is no next page.
	 */
	private function parse_npt($json, $pagetype, $proxy){
		
		foreach(($json["render"]["presenter"]["pagination"]["pages"] ?? []) as $page){
			
			if($page["name"] != "Next"){
				
				continue;
			}
			
			parse_str(
				explode(
					"?",
					$page["url"],
					2
				)[1],
				$str
			);
			
			return
				$this->backend->store(
					http_build_query(
						[
							"lui" => "english",
							"language" => "english",
							"query" => $str["q"],
							"cat" => $pagetype,
							"sc" => $str["sc"],
							"t" => "device",
							"segment" => "startpage.udog",
							"page" => $str["page"]
						]
					),
					$pagetype,
					$proxy
				);
		}
		
		return null;
	}
	
	/**
	 * Unwrap a Startpage image-proxy URL into the URL held in its "piurl"
	 * query parameter.
	 *
	 * For gstatic.com and bing.net/bing.com targets everything from the
	 * first "&" on is cut off. URLs without a usable "piurl" are returned
	 * unchanged.
	 *
	 * @param string $url
	 * @return string
	 */
	private function unshitimage($url){
		
		$query = parse_url($url, PHP_URL_QUERY);
		
		if(!is_string($query)){
			
			// no query string (null) or unparsable URL (false)
			return $url;
		}
		
		parse_str($query, $params);
		
		if(
			!isset($params["piurl"]) ||
			!is_string($params["piurl"])
		){
			
			return $url;
		}
		
		$piurl = $params["piurl"];
		
		// strpos() can legitimately return 0, so compare against false
		if(
			strpos($piurl, "gstatic.com/") !== false ||
			strpos($piurl, "bing.net/") !== false ||
			strpos($piurl, "bing.com/") !== false
		){
			
			return explode("&", $piurl, 2)[0];
		}
		
		return $piurl;
	}
	
	/**
	 * Flatten line breaks to spaces and keep only the first chunk of the
	 * text as wrapped by wordwrap() at 300 characters.
	 *
	 * @param string $text
	 * @return string
	 */
	private function limitstrlen($text){
		
		return
			explode(
				"\n",
				wordwrap(
					str_replace(
						["\n\r", "\r\n", "\n", "\r"],
						" ",
						$text
					),
					300,
					"\n"
				),
				2
			)[0];
	}
	
	/**
	 * Strip whitespace, dots and ellipsis characters from both ends of a
	 * title.
	 *
	 * trim() works on bytes: with the multibyte ellipsis in its mask it
	 * strips the bytes 0xE2, 0x80 and 0xA6 one by one and can cut other
	 * UTF-8 characters in half (a trailing Cyrillic "р" is D1 80). The
	 * regex below works on whole code points instead.
	 *
	 * @param string $title
	 * @return string
	 */
	private function titledots($title){
		
		$title = (string)$title;
		
		$trimmed =
			preg_replace(
				'/^[ .\t\n\r\0\x0B\x{2026}]+|[ .\t\n\r\0\x0B\x{2026}]+$/u',
				"",
				$title
			);
		
		if($trimmed === null){
			
			// input is not valid UTF-8: only strip the single-byte characters
			return trim($title, " .\t\n\r\0\x0B");
		}
		
		return $trimmed;
	}
	
	/**
	 * Convert "h:m:s", "m:s" or "s" into a number of seconds.
	 *
	 * @param string $time
	 * @return int
	 */
	private function hms2int($time){
		
		$parts = explode(":", $time, 3);
		$time = 0;
		
		if(count($parts) === 3){
			
			// hours
			$time = $time + ((int)$parts[0] * 3600);
			array_shift($parts);
		}
		
		if(count($parts) === 2){
			
			// minutes
			$time = $time + ((int)$parts[0] * 60);
			array_shift($parts);
		}
		
		// seconds
		$time = $time + (int)$parts[0];
		
		return $time;
	}
	
	/**
	 * Remove highlight marker characters from news text.
	 *
	 * NOTE(review): both search strings were empty in this copy of the
	 * file, which made str_replace() a no-op. They are assumed to have
	 * been the private-use characters U+E000 / U+E001 that Bing uses to
	 * mark highlighted terms -- confirm against the upstream file.
	 *
	 * @param string $text
	 * @return string
	 */
	private function remove_penguins($text){
		
		return
			str_replace(
				["\u{E000}", "\u{E001}"],
				"",
				$text
			);
	}
	
	/**
	 * Throw when Startpage answered with a redirect page instead of results.
	 *
	 * @param string $html Response body.
	 * @throws Exception When the page title is "Redirecting..."; the message
	 *                   tells a captcha apart from any other redirect.
	 */
	private function detect_captcha($html){
		
		$this->fuckhtml->load($html);
		
		$title =
			$this->fuckhtml
			->getElementsByTagName(
				"title"
			);
		
		if(
			count($title) !== 0 &&
			$title[0]["innerHTML"] == "Redirecting..."
		){
			
			// check if it's a captcha
			$as =
				$this->fuckhtml
				->getElementsByTagName(
					"a"
				);
			
			foreach($as as $a){
				
				if(
					strpos(
						$this->fuckhtml
						->getTextContent(
							$a["innerHTML"]
						),
						"https://www.startpage.com/sp/captcha"
					) !== false
				){
					
					throw new Exception("Startpage returned a captcha");
				}
			}
			
			throw new Exception("Startpage redirected the scraper to an unhandled page");
		}
	}
}