1
Fork 0
mirror of https://git.lolcat.ca/lolcat/4get.git synced 2024-12-03 23:42:16 -05:00

Compare commits

...

5 commits

Author SHA1 Message Date
lolcat
4e4796bb71 startpage captcha handle 2024-07-29 18:25:25 -04:00
lolcat
ff06bc1f51 add startpage ac option in settings 2024-07-29 10:34:56 -04:00
lolcat
d0ca0f46a7 accidentally fucked with config again 2024-07-29 10:17:21 -04:00
lolcat
8a32827a39 added startpage word definitions, sp images, sp videos, sp news, sp ac 2024-07-29 10:15:17 -04:00
lolcat
143c0c1364 fix yandex image scraper 2024-07-27 11:43:10 -04:00
7 changed files with 854 additions and 46 deletions

View file

@ -6,7 +6,7 @@
## About 4get
https://4get.ca/about
## Try it out
## Official instance
https://4get.ca
## Totally unbiased comparison between alternatives
@ -35,8 +35,9 @@ tl;dr the best way to actually browse for shit.
| DuckDuckGo | DuckDuckGo | YouTube | DuckDuckGo | Soundcloud | Brave |
| Brave | Brave | DuckDuckGo | Brave | | DuckDuckGo |
| Yandex | Yandex | Brave | Google | | Yandex |
| Google | Google | Yandex | Qwant | | Google |
| Qwant | Qwant | Google | Mojeek | | Qwant |
| Google | Google | Yandex | Startpage | | Google |
| Startpage | Startpage | Google | Qwant | | Startpage |
| Qwant | Qwant | Startpage | Mojeek | | Qwant |
| Yep | Yep | Qwant | | | Yep |
| Greppr | Imgur | | | | Marginalia |
| Crowdview | FindThatMeme | | | | YouTube |
@ -51,3 +52,6 @@ Refer to the <a href="https://git.lolcat.ca/lolcat/4get/src/branch/master/docs/"
## Contact
Shit breaks all the time but I repair it all the time too... Email me here: <b>will (at) lolcat.ca</b> or create an issue.
## License
AGPL

View file

@ -18,7 +18,8 @@ class autocomplete{
"yep" => "https://api.yep.com/ac/?query={searchTerms}",
"marginalia" => "https://search.marginalia.nu/suggest/?partial={searchTerms}",
"yt" => "https://suggestqueries-clients6.youtube.com/complete/search?client=youtube&q={searchTerms}",
"sc" => ""
"sc" => "",
"startpage" => "https://www.startpage.com/suggestions?q={searchTerms}&format=opensearch&segment=startpage.defaultffx&lui=english"
];
/*

View file

@ -11,7 +11,7 @@ class config{
const SERVER_NAME = "4get";
// Will be shown in <meta> tag on home page
const SERVER_SHORT_DESCRIPTION = "They live in our walls!";
const SERVER_SHORT_DESCRIPTION = "4get is a proxy search engine that doesn't suck.";
// Will be shown in server list ping (null for no description)
const SERVER_LONG_DESCRIPTION = null;
@ -111,7 +111,7 @@ class config{
// Default user agent to use for scraper requests. Sometimes ignored to get specific webpages
// Changing this might break things.
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0";
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0";
// Proxy pool assignments for each scraper
// false = Use server's raw IP

View file

@ -945,6 +945,7 @@ class frontend{
"yandex" => "Yandex",
"brave" => "Brave",
"google" => "Google",
"startpage" => "Startpage",
"qwant" => "Qwant",
"yep" => "Yep",
//"pinterest" => "Pinterest",
@ -964,6 +965,7 @@ class frontend{
"brave" => "Brave",
"yandex" => "Yandex",
"google" => "Google",
"startpage" => "Startpage",
"qwant" => "Qwant"
]
];
@ -976,6 +978,7 @@ class frontend{
"ddg" => "DuckDuckGo",
"brave" => "Brave",
"google" => "Google",
"startpage" => "Startpage",
"qwant" => "Qwant",
"yep" => "Yep",
"mojeek" => "Mojeek"

View file

@ -88,7 +88,7 @@ class startpage{
]
],
"time" => [ // with_date
"display" => "Time fetched",
"display" => "Time posted",
"option" => [
"any" => "Any time",
"d" => "Past 24 hours",
@ -106,6 +106,141 @@ class startpage{
]
];
break;
case "images":
	// Filters offered for image searches. The trailing comment on each
	// filter names the Startpage request parameter it corresponds to.
	return [
		"nsfw" => [ // qadf
			"display" => "NSFW",
			"option" => [
				"yes" => "Yes", // qadf=none
				"no" => "No" // qadf=heavy
			]
		],
		"size" => [ // flimgsize
			"display" => "Size",
			"option" => [
				"any" => "Any size",
				"Small" => "Small",
				"Medium" => "Medium",
				"Large" => "Large",
				"Wallpaper" => "Wallpaper",
				// from here, image-size-select, var prefix = isz:lt,islt:
				// NOTE(review): "qsvgs" is possibly a typo for "qsvga";
				// confirm against Startpage before renaming the key.
				// (The duplicate "qsvgs" entry that used to follow "xga" was
				// dropped: a repeated key only overwrites the first one.)
				"qsvgs" => "Larger than 400x300",
				"vga" => "Larger than 640x480",
				"svga" => "Larger than 800x600",
				"xga" => "Larger than 1024x768",
				"2mp" => "Larger than 2 MP (1600x1200)",
				"4mp" => "Larger than 4 MP (2272x1704)",
				"6mp" => "Larger than 6 MP (2816x2112)",
				"8mp" => "Larger than 8 MP (3264x2448)",
				"10mp" => "Larger than 10 MP (3648x2736)",
				"12mp" => "Larger than 12 MP (4096x3072)",
				"15mp" => "Larger than 15 MP (4480x3360)",
				"20mp" => "Larger than 20 MP (5120x3840)",
				"40mp" => "Larger than 40 MP (7216x5412)",
				"70mp" => "Larger than 70 MP (9600x7200)"
			]
		],
		"color" => [ // flimgcolor
			"display" => "Color",
			"option" => [
				"any" => "Any color",
				// from here, var prefix = ic:
				"color" => "Color only",
				"bnw" => "Black & white", // set to "gray"
				// from here, var prefix = ic:specific,isc:
				"red" => "Red",
				"orange" => "Orange",
				"yellow" => "Yellow",
				"green" => "Green",
				"teal" => "Teal",
				"blue" => "Blue",
				"purple" => "Purple",
				"pink" => "Pink",
				"white" => "White",
				"gray" => "Gray",
				"black" => "Black",
				"brown" => "Brown"
			]
		],
		"type" => [ // flimgtype
			"display" => "Type",
			"option" => [
				"any" => "Any type",
				"AnimatedGif" => "Animated GIF",
				"Clipart" => "Clip Art",
				"Line" => "Line Drawing",
				"Photo" => "Photograph",
				"Transparent" => "Transparent Background"
			]
		],
		"license" => [ // flimglicense
			"display" => "License",
			"option" => [
				"any" => "Any license",
				"p" => "Public domain",
				"s" => "Free to share",
				"sc" => "Free to share commercially",
				"m" => "Free to modify",
				"mc" => "Free to modify commercially"
			]
		]
	];
	break;
case "videos":
return [
"nsfw" => [ // qadf
"display" => "NSFW",
"option" => [
"yes" => "Yes", // qadf=none
"no" => "No" // qadf=heavy
]
],
"sort" => [
"display" => "Sort by",
"option" => [
"relevance" => "Most relevant",
"popular" => "Most popular",
"recent" => "Most recent"
]
],
"duration" => [ // with_duration
"display" => "Duration",
"option" => [
"any" => "Any duration",
"short" => "Short",
"medium" => "Medium",
"long" => "Long"
]
]
];
break;
case "news":
return [
"nsfw" => [ // qadf
"display" => "NSFW",
"option" => [
"yes" => "Yes", // qadf=none
"no" => "No" // qadf=heavy
]
],
"time" => [ // with_date
"display" => "Time posted",
"option" => [
"any" => "Any time",
"d" => "Past 24 hours",
"w" => "Past week",
"m" => "Past month"
]
]
];
break;
//preferences=date_timeEEEworldN1Ndisable_family_filterEEE1N1Ndisable_open_in_new_windowEEE0N1Nenable_post_methodEEE1N1Nenable_proxy_safety_suggestEEE0N1Nenable_stay_controlEEE0N1Ninstant_answersEEE1N1Nlang_homepageEEEs%2Fdevice%2FenN1NlanguageEEEazerbaijaniN1Nlanguage_uiEEEenglishN1Nnum_of_resultsEEE20N1Nsearch_results_regionEEEallN1NsuggestionsEEE1N1Nwt_unitEEEcelsius; Domain=startpage.com; Expires=Mon, 28 Oct 2024 20:21:58 GMT; Secure; Path=/
//preferences=date_timeEEEworldN1Ndisable_family_filterEEE1N1Ndisable_open_in_new_windowEEE0N1Nenable_post_methodEEE1N1Nenable_proxy_safety_suggestEEE0N1Nenable_stay_controlEEE0N1Ninstant_answersEEE1N1Nlang_homepageEEEs%2Fdevice%2FenN1NlanguageEEEenglishN1Nlanguage_uiEEEenglishN1Nnum_of_resultsEEE20N1Nsearch_results_regionEEEallN1NsuggestionsEEE1N1Nwt_unitEEEcelsius; Domain=startpage.com; Expires=Mon, 28 Oct 2024 20:22:52 GMT; Secure; Path=/
}
}
@ -273,6 +408,8 @@ class startpage{
//$html = file_get_contents("scraper/startpage.html");
}
$this->detect_captcha($html);
if(
preg_match(
'/React\.createElement\(UIStartpage\.AppSerpWeb, ?(.+)\),$/m',
@ -291,6 +428,8 @@ class startpage{
throw new Exception("Failed to decode JSON");
}
//print_r($json);
$out = [
"status" => "ok",
"spelling" => [
@ -308,40 +447,7 @@ class startpage{
];
// get npt
foreach($json["render"]["presenter"]["pagination"]["pages"] as $page){
if($page["name"] == "Next"){
parse_str(
explode(
"?",
$page["url"],
2
)[1],
$str
);
$out["npt"] =
$this->backend->store(
http_build_query(
[
"lui" => "english",
"language" => "english",
"query" => $str["q"],
"cat" => "web",
"sc" => $str["sc"],
"t" => "device",
"segment" => "startpage.udog",
"page" => $str["page"]
]
),
"web",
$proxy
);
break;
}
}
$out["npt"] = $this->parse_npt($json, "web", $proxy);
foreach($json["render"]["presenter"]["regions"]["mainline"] as $category){
@ -456,6 +562,86 @@ class startpage{
];
}
break;
case "spellsuggest-google":
$out["spelling"] =
[
"type" => "including",
"using" => $json["render"]["query"],
"correction" => $category["results"][0]["query"]
];
break;
case "dictionary-qi":
foreach($category["results"] as $result){
$answer = [
"title" => $result["word"],
"description" => [],
"url" => null,
"thumb" => null,
"table" => [],
"sublink" => []
];
foreach($result["lexical_categories"] as $lexic_type => $definitions){
$answer["description"][] = [
"type" => "title",
"value" => $lexic_type
];
$i = 0;
foreach($definitions as $definition){
$text_definition = trim($definition["definition"]);
$text_example = trim($definition["example"]);
$text_synonyms = implode(", ", $definition["synonyms"]);
if($text_definition != ""){
$i++;
$c = count($answer["description"]) - 1;
if(
$c !== 0 &&
$answer["description"][$c]["type"] == "text"
){
$answer["description"][$c]["value"] .=
"\n\n" . $i . ". " . $text_definition;
}else{
$answer["description"][] = [
"type" => "text",
"value" => $i . ". " . $text_definition
];
}
}
if($text_example != ""){
$answer["description"][] = [
"type" => "quote",
"value" => $text_example
];
}
if($text_synonyms != ""){
$answer["description"][] = [
"type" => "text",
"value" => "Synonyms: " . $text_synonyms
];
}
}
}
$out["answer"][] = $answer;
}
break;
}
}
@ -568,9 +754,11 @@ class startpage{
$answer["description"][] = [
"type" => "text",
"value" =>
$this->fuckhtml
->getTextContent(
$description[0]
html_entity_decode(
$this->fuckhtml
->getTextContent(
$description[0]
)
)
];
}
@ -772,6 +960,488 @@ class startpage{
return $out;
}
/**
 * Scrape one page of Startpage image results.
 *
 * @param array $get Request parameters. Always reads "npt" (next page
 *                   token). Without a token it also reads "s" (search
 *                   term), "nsfw", "size", "color", "type" and "license".
 * @return array     ["status" => "ok", "npt" => token|null, "image" => [...]]
 * @throws Exception When the search term is empty, the page cannot be
 *                   fetched, or the embedded JSON object cannot be found
 *                   or decoded. detect_captcha() may throw as well.
 */
public function image($get){
	if($get["npt"]){
		// follow-up page: replay the request stored behind the token
		[$post, $proxy] = $this->backend->get($get["npt"], "images");
		try{
			$html = $this->get(
				$proxy,
				"https://www.startpage.com/sp/search",
				$post,
				true
			);
		}catch(Exception $error){
			throw new Exception("Failed to fetch search page");
		}
	}else{
		$search = $get["s"];
		if(strlen($search) === 0){
			throw new Exception("Search term is empty!");
		}
		// A single try block is enough: every failure in here was already
		// reported with the same message by the former nested handlers.
		try{
			$proxy = $this->backend->get_ip();
			$params = [
				"query" => $search,
				"cat" => "images",
				"pl" => "opensearch"
			];
			if($get["nsfw"] == "no"){
				$params["qadf"] = "heavy";
			}
			if($get["size"] != "any"){
				if(
					in_array(
						$get["size"],
						["Small", "Medium", "Large", "Wallpaper"]
					)
				){
					// named size classes
					$params["flimgsize"] = $get["size"];
				}else{
					// "larger than" size codes
					$params["image-size-select"] = "isz:lt,islt:" . $get["size"];
				}
			}
			if($get["color"] != "any"){
				if($get["color"] == "color"){
					$params["flimgcolor"] = "ic:color";
				}elseif($get["color"] == "bnw"){
					$params["flimgcolor"] = "ic:gray";
				}else{
					$params["flimgcolor"] = "ic:specific,isc:" . $get["color"];
				}
			}
			if($get["type"] != "any"){
				$params["flimgtype"] = $get["type"];
			}
			if($get["license"] != "any"){
				$params["flimglicense"] = $get["license"];
			}
			$html = $this->get(
				$proxy,
				"https://www.startpage.com/sp/search",
				$params
			);
			//$html = file_get_contents("scraper/startpage.html");
		}catch(Exception $error){
			throw new Exception("Failed to fetch search page");
		}
	}
	$this->detect_captcha($html);
	$out = [
		"status" => "ok",
		"npt" => null,
		"image" => []
	];
	// preg_match() returns false on a PCRE error, so compare against 1:
	// "=== 0" would let the error through with $matches[1] undefined.
	if(
		preg_match(
			'/React\.createElement\(UIStartpage\.AppSerpImages, ?(.+)\),$/m',
			$html,
			$matches
		) !== 1
	){
		throw new Exception("Failed to grep JSON object");
	}
	$json = json_decode($matches[1], true);
	if($json === null){
		throw new Exception("Failed to decode JSON object");
	}
	// get npt
	$out["npt"] = $this->parse_npt($json, "images", $proxy);
	// get images; a page without a mainline region yields no images
	$mainline = $json["render"]["presenter"]["regions"]["mainline"] ?? [];
	foreach($mainline as $category){
		if($category["display_type"] != "images-bing"){
			// ignore ads and !! suggestions !! @todo
			continue;
		}
		foreach($category["results"] as $image){
			$out["image"][] = [
				"title" => $this->titledots($image["title"]),
				"source" => [
					[
						"url" => $this->unshitimage($image["clickUrl"]),
						"width" => (int)$image["width"],
						"height" => (int)$image["height"]
					],
					[
						"url" => $this->unshitimage($image["thumbnailUrl"]),
						"width" => (int)$image["thumbnailWidth"],
						"height" => (int)$image["thumbnailHeight"]
					]
				],
				"url" => $image["altClickUrl"]
			];
		}
	}
	return $out;
}
/**
 * Scrape one page of Startpage video results.
 *
 * @param array $get Request parameters. Always reads "npt" (next page
 *                   token). Without a token it also reads "s" (search
 *                   term), "nsfw", "sort" and "duration".
 * @return array     ["status" => "ok", "npt" => token|null, "video" => [...],
 *                   "author" => [], "livestream" => [], "playlist" => [],
 *                   "reel" => []]; only "video" is ever populated here.
 * @throws Exception When the search term is empty, the page cannot be
 *                   fetched, or the embedded JSON object cannot be found
 *                   or decoded. detect_captcha() may throw as well.
 */
public function video($get){
	if($get["npt"]){
		// follow-up page: replay the request stored behind the token
		[$post, $proxy] = $this->backend->get($get["npt"], "videos");
		try{
			$html = $this->get(
				$proxy,
				"https://www.startpage.com/sp/search",
				$post,
				true
			);
		}catch(Exception $error){
			throw new Exception("Failed to fetch search page");
		}
	}else{
		$search = $get["s"];
		if(strlen($search) === 0){
			throw new Exception("Search term is empty!");
		}
		// A single try block is enough: every failure in here was already
		// reported with the same message by the former nested handlers.
		try{
			$proxy = $this->backend->get_ip();
			$params = [
				"query" => $search,
				"cat" => "video",
				"pl" => "opensearch"
			];
			if($get["nsfw"] == "no"){
				$params["qadf"] = "heavy";
			}
			if($get["sort"] != "relevance"){
				$params["sort_by"] = $get["sort"];
			}
			if($get["duration"] != "any"){
				$params["with_duration"] = $get["duration"];
			}
			$html = $this->get(
				$proxy,
				"https://www.startpage.com/sp/search",
				$params
			);
			//$html = file_get_contents("scraper/startpage.html");
		}catch(Exception $error){
			throw new Exception("Failed to fetch search page");
		}
	}
	$this->detect_captcha($html);
	// preg_match() returns false on a PCRE error, so compare against 1:
	// "=== 0" would let the error through with $matches[1] undefined.
	if(
		preg_match(
			'/React\.createElement\(UIStartpage\.AppSerpVideos, ?(.+)\),$/m',
			$html,
			$matches
		) !== 1
	){
		throw new Exception("Failed to get JSON object");
	}
	$json = json_decode($matches[1], true);
	if($json === null){
		throw new Exception("Failed to decode JSON object");
	}
	$out = [
		"status" => "ok",
		"npt" => null,
		"video" => [],
		"author" => [],
		"livestream" => [],
		"playlist" => [],
		"reel" => []
	];
	// get npt
	// NOTE(review): the token is stored under page type "video" but read
	// back above with "videos" -- confirm the backend treats both alike.
	$out["npt"] = $this->parse_npt($json, "video", $proxy);
	// get results; a page without a mainline region yields no videos
	$mainline = $json["render"]["presenter"]["regions"]["mainline"] ?? [];
	foreach($mainline as $category){
		if($category["display_type"] != "video-youtube"){
			// unsupported category
			continue;
		}
		foreach($category["results"] as $video){
			if(
				isset($video["thumbnailUrl"]) &&
				$video["thumbnailUrl"] !== null
			){
				$thumb = [
					"ratio" => "16:9",
					"url" => $this->unshitimage($video["thumbnailUrl"])
				];
			}else{
				$thumb = [
					"ratio" => null,
					"url" => null
				];
			}
			$out["video"][] = [
				"title" => $video["title"],
				"description" => $this->limitstrlen($video["description"]),
				"author" => [
					"name" => $video["channelTitle"],
					"url" => null,
					"avatar" => null
				],
				"date" => strtotime($video["publishDate"]),
				"duration" => $this->hms2int($video["duration"]),
				"views" => (int)$video["viewCount"],
				"thumb" => $thumb,
				"url" => $video["clickUrl"]
			];
		}
	}
	return $out;
}
/**
 * Scrape one page of Startpage news results.
 *
 * @param array $get Request parameters. Always reads "npt" (next page
 *                   token). Without a token it also reads "s" (search
 *                   term), "nsfw" and "time".
 * @return array     ["status" => "ok", "npt" => token|null, "news" => [...]]
 * @throws Exception When the search term is empty, the page cannot be
 *                   fetched, or the embedded JSON object cannot be found
 *                   or decoded. detect_captcha() may throw as well.
 */
public function news($get){
	if($get["npt"]){
		// follow-up page: replay the request stored behind the token
		[$post, $proxy] = $this->backend->get($get["npt"], "news");
		try{
			$html = $this->get(
				$proxy,
				"https://www.startpage.com/sp/search",
				$post,
				true
			);
		}catch(Exception $error){
			throw new Exception("Failed to fetch search page");
		}
	}else{
		$search = $get["s"];
		if(strlen($search) === 0){
			throw new Exception("Search term is empty!");
		}
		// A single try block is enough: every failure in here was already
		// reported with the same message by the former nested handlers.
		try{
			$proxy = $this->backend->get_ip();
			$params = [
				"query" => $search,
				"cat" => "news",
				"pl" => "opensearch"
			];
			if($get["nsfw"] == "no"){
				$params["qadf"] = "heavy";
			}
			if($get["time"] != "any"){
				$params["with_date"] = $get["time"];
			}
			$html = $this->get(
				$proxy,
				"https://www.startpage.com/sp/search",
				$params
			);
			//$html = file_get_contents("scraper/startpage.html");
		}catch(Exception $error){
			throw new Exception("Failed to fetch search page");
		}
	}
	$this->detect_captcha($html);
	// preg_match() returns false on a PCRE error, so compare against 1:
	// "=== 0" would let the error through with $matches[1] undefined.
	if(
		preg_match(
			'/React\.createElement\(UIStartpage\.AppSerpNews, ?(.+)\),$/m',
			$html,
			$matches
		) !== 1
	){
		throw new Exception("Failed to get JSON object");
	}
	$json = json_decode($matches[1], true);
	if($json === null){
		throw new Exception("Failed to decode JSON object");
	}
	$out = [
		"status" => "ok",
		"npt" => null,
		"news" => []
	];
	// get npt
	$out["npt"] = $this->parse_npt($json, "news", $proxy);
	// get articles; a page without a mainline region yields no news
	$mainline = $json["render"]["presenter"]["regions"]["mainline"] ?? [];
	foreach($mainline as $category){
		if($category["display_type"] != "news-bing"){
			// unsupported category
			continue;
		}
		foreach($category["results"] as $news){
			if(
				isset($news["thumbnailUrl"]) &&
				$news["thumbnailUrl"] !== null
			){
				$thumb = [
					"ratio" => "16:9",
					"url" => $this->unshitimage($news["thumbnailUrl"])
				];
			}else{
				$thumb = [
					"ratio" => null,
					"url" => null
				];
			}
			$out["news"][] = [
				"title" => $this->titledots($this->remove_penguins($news["title"])),
				"author" => $news["source"],
				"description" => $this->titledots($this->remove_penguins($news["description"])),
				// drop the last three digits of the timestamp
				// (presumably milliseconds -> seconds; confirm)
				"date" => (int)substr((string)$news["date"], 0, -3),
				"thumb" => $thumb,
				"url" => $news["clickUrl"]
			];
		}
	}
	return $out;
}
/**
 * Build and store the "next page" request for a result page.
 *
 * Looks for the pagination entry named "Next", lifts q/sc/page from its
 * URL and stores a rebuilt query string through the backend.
 *
 * @param array  $json     Decoded page JSON (reads render.presenter.pagination.pages)
 * @param string $pagetype Used both as the "cat" request value and as the
 *                         page type handed to the backend store
 * @param mixed  $proxy    Proxy to associate with the stored request
 * @return mixed Whatever backend->store() returns, or null when there is
 *               no usable "Next" entry
 */
private function parse_npt($json, $pagetype, $proxy){
	// a page without pagination simply has no next page
	$pages = $json["render"]["presenter"]["pagination"]["pages"] ?? [];
	foreach($pages as $page){
		if($page["name"] != "Next"){
			continue;
		}
		$parts = explode("?", $page["url"], 2);
		if(!isset($parts[1])){
			// "Next" link carries no query string, nothing to replay
			return null;
		}
		parse_str($parts[1], $str);
		return
			$this->backend->store(
				http_build_query(
					[
						"lui" => "english",
						"language" => "english",
						// http_build_query() skips null values, so a
						// missing key is left out instead of warning
						"query" => $str["q"] ?? null,
						"cat" => $pagetype,
						"sc" => $str["sc"] ?? null,
						"t" => "device",
						"segment" => "startpage.udog",
						"page" => $str["page"] ?? null
					]
				),
				$pagetype,
				$proxy
			);
	}
	return null;
}
private function unshitimage($url){
$query = parse_url($url, PHP_URL_QUERY);
@ -789,14 +1459,121 @@ class startpage{
)[0];
}
if(
strpos($query["piurl"], "bing.net/") ||
strpos($query["piurl"], "bing.com/")
){
return
explode(
"&",
$query["piurl"],
2
)[0];
}
return $query["piurl"];
}
return $url;
}
/**
 * Shorten a text to its first wrapped line.
 *
 * Line breaks are turned into spaces, the text is word-wrapped at a width
 * of 300 and only the first resulting line is returned.
 *
 * @param string $text
 * @return string
 */
private function limitstrlen($text){
	// put the whole text on one line
	$flattened = str_replace(["\n\r", "\r\n", "\n", "\r"], " ", $text);
	// break it again at word boundaries
	$wrapped = wordwrap($flattened, 300, "\n");
	// keep only what comes before the first break
	$lines = explode("\n", $wrapped, 2);
	return $lines[0];
}
/**
 * Trim spaces, dots, tabs, line breaks, NUL bytes and vertical tabs from
 * both ends of a title.
 *
 * @param string $title
 * @return string
 */
private function titledots($title){
	$strip = " .\t\n\r\0\x0B";
	return trim($title, $strip);
}
/**
 * Convert a "H:M:S", "M:S" or "S" duration string to seconds.
 *
 * @param string $time
 * @return int
 */
private function hms2int($time){
	// at most three fields; anything past the second colon stays in the
	// last field and is cut off by the integer cast
	$fields = explode(":", $time, 3);
	$seconds = 0;
	// each field to the right is worth 1/60 of the one before it
	foreach($fields as $field){
		$seconds = ($seconds * 60) + (int)$field;
	}
	return $seconds;
}
/**
 * Strip the private-use characters U+E000 and U+E001 from a text.
 *
 * The two search strings were invisible raw literals, which are easily
 * lost by editors and transports (with empty needles str_replace() does
 * nothing). They are spelled as escapes here.
 * NOTE(review): U+E000/U+E001 are the code points Bing documents for hit
 * highlighting; confirm they match the original literal bytes.
 *
 * @param string $text
 * @return string
 */
private function remove_penguins($text){
	return str_replace(
		["\u{E000}", "\u{E001}"],
		"",
		$text
	);
}
/**
 * Throw when the fetched page is a redirect stub instead of results.
 *
 * The HTML is loaded into the parser. If the first <title> reads
 * "Redirecting...", the page is rejected: as a captcha when any link text
 * contains the Startpage captcha URL, as an unhandled page otherwise.
 *
 * @param string $html
 * @throws Exception On a captcha or any other redirect page
 */
private function detect_captcha($html){
	$this->fuckhtml->load($html);
	$titles = $this->fuckhtml->getElementsByTagName("title");
	// any other title (or none at all) is treated as a normal page
	if(
		count($titles) === 0 ||
		$titles[0]["innerHTML"] != "Redirecting..."
	){
		return;
	}
	// check if it's a captcha
	$links = $this->fuckhtml->getElementsByTagName("a");
	foreach($links as $link){
		$text = $this->fuckhtml->getTextContent($link["innerHTML"]);
		if(strpos($text, "https://www.startpage.com/sp/captcha") !== false){
			throw new Exception("Startpage returned a captcha");
		}
	}
	throw new Exception("Startpage redirected the scraper to an unhandled page");
}
}

View file

@ -751,6 +751,13 @@ class yandex{
"url" => htmlspecialchars_decode($image["snippet"]["url"])
];
// add preview URL
$tmp["source"][] = [
"url" => htmlspecialchars_decode($image["viewerData"]["preview"][0]["url"]),
"width" => (int)$image["viewerData"]["preview"][0]["w"],
"height" => (int)$image["viewerData"]["preview"][0]["h"],
];
foreach($image["viewerData"]["dups"] as $dup){
$tmp["source"][] = [
@ -767,8 +774,8 @@ class yandex{
"https://",
htmlspecialchars_decode($image["viewerData"]["thumb"]["url"])
),
"width" => (int)$image["viewerData"]["thumb"]["size"]["width"],
"height" => (int)$image["viewerData"]["thumb"]["size"]["height"]
"width" => (int)$image["viewerData"]["thumb"]["w"],
"height" => (int)$image["viewerData"]["thumb"]["h"]
];
$out["image"][] = $tmp;

View file

@ -79,6 +79,10 @@ $settings = [
"value" => "google",
"text" => "Google"
],
[
"value" => "startpage",
"text" => "Startpage"
],
[
"value" => "qwant",
"text" => "Qwant"
@ -183,6 +187,10 @@ $settings = [
"value" => "google",
"text" => "Google"
],
[
"value" => "startpage",
"text" => "Startpage"
],
[
"value" => "qwant",
"text" => "Qwant"
@ -229,6 +237,10 @@ $settings = [
"value" => "google",
"text" => "Google"
],
[
"value" => "startpage",
"text" => "Startpage"
],
[
"value" => "qwant",
"text" => "Qwant"
@ -251,6 +263,10 @@ $settings = [
"value" => "google",
"text" => "Google"
],
[
"value" => "startpage",
"text" => "Startpage"
],
[
"value" => "qwant",
"text" => "Qwant"