0
Fork 0
mirror of https://git.lolcat.ca/lolcat/4get.git synced 2025-01-15 00:10:33 -05:00
4get/scraper/wiby.php

247 lines
4.5 KiB
PHP
Raw Normal View History

2023-07-22 14:41:14 -04:00
<?php
/**
 * Scraper for the wiby.me search engine. Only the "web" page is supported.
 *
 * Relies on two names that are not defined in this file: the `backend` class
 * (loaded from lib/backend.php) and a `config` class exposing USER_AGENT.
 */
class wiby{
	
	/**
	 * Tokens stripped from the user's query before it is sent to wiby.
	 *
	 * str_replace() applies array needles in order, so every three-character
	 * token MUST come before its two-character prefix; otherwise "!gi" would
	 * be reduced to a stray "i" once "!g" has been removed.
	 */
	private const RESERVED_TOKENS = [
		"!gi", "!gv", "!gm", "!g",
		"!bi", "!bv", "!bm", "!b",
		"!td", "!tw", "!tm", "!ty",
		"&gi", "&gv", "&gm", "&g",
		"&bi", "&bv", "&bm", "&b",
		"&td", "&tw", "&tm", "&ty",
	];
	
	/**
	 * Maps the "date" filter value to the token prepended to the query.
	 * Values without an entry (e.g. "any") add no prefix.
	 */
	private const DATE_PREFIXES = [
		"day" => "!td ",
		"week" => "!tw ",
		"month" => "!tm ",
		"year" => "!ty ",
	];
	
	/**
	 * Backend helper used for proxy assignment and next-page token storage.
	 * Declared explicitly (dynamic properties are deprecated since PHP 8.2)
	 * and kept public, as the original dynamic property was.
	 */
	public $backend;
	
	public function __construct(){
		
		// include_once: a plain include would redeclare the backend class
		// (fatal error) if it was already loaded earlier in the request.
		include_once "lib/backend.php";
		$this->backend = new backend("wiby");
	}
	
	/**
	 * Describe the filters available for a result page.
	 *
	 * @param string $page Page type; only "web" has filters.
	 * @return array Filter definitions keyed by filter name, or [] for any
	 *               page other than "web".
	 */
	public function getfilters($page){
		
		if($page != "web"){
			
			return [];
		}
		
		return [
			"nsfw" => [
				"display" => "NSFW",
				"option" => [
					"yes" => "Yes",
					"no" => "No"
				]
			],
			"date" => [
				"display" => "Time posted",
				"option" => [
					"any" => "Any time",
					"day" => "Past day",
					"week" => "Past week",
					"month" => "Past month",
					"year" => "Past year",
				]
			]
		];
	}
	
	/**
	 * Perform a GET request and return the response body.
	 *
	 * $get used to default to [], but a default placed before the required
	 * $nsfw parameter is deprecated since PHP 8.0 and could never be used
	 * anyway; every caller passes all four arguments.
	 *
	 * @param mixed  $proxy Proxy descriptor handed to backend::assign_proxy().
	 * @param string $url   Base URL.
	 * @param array  $get   Query-string parameters; [] appends nothing.
	 * @param string $nsfw  Value sent in the "ws" cookie.
	 * @return string Response body.
	 * @throws Exception With the cURL error message when the transfer fails.
	 */
	private function get($proxy, $url, $get, $nsfw){
		
		$curlproc = curl_init();
		
		if($get !== []){
			
			$url .= "?" . http_build_query($get);
		}
		
		curl_setopt($curlproc, CURLOPT_URL, $url);
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		curl_setopt(
			$curlproc,
			CURLOPT_HTTPHEADER,
			[
				"User-Agent: " . config::USER_AGENT,
				"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
				"Accept-Language: en-US,en;q=0.5",
				"Accept-Encoding: gzip",
				"Cookie: ws={$nsfw}",
				"DNT: 1",
				"Connection: keep-alive",
				"Upgrade-Insecure-Requests: 1",
				"Sec-Fetch-Dest: document",
				"Sec-Fetch-Mode: navigate",
				"Sec-Fetch-Site: none",
				"Sec-Fetch-User: ?1"
			]
		);
		
		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		
		$this->backend->assign_proxy($curlproc, $proxy);
		
		$data = curl_exec($curlproc);
		
		if(curl_errno($curlproc)){
			
			// read the message before closing so the handle is released on
			// the failure path as well
			$message = curl_error($curlproc);
			curl_close($curlproc);
			throw new Exception($message);
		}
		
		curl_close($curlproc);
		return $data;
	}
	
	/**
	 * Run a web search, or continue one from a next-page token.
	 *
	 * @param array $get Request parameters. Keys read here: "npt" (next-page
	 *                   token; when non-empty the other keys are ignored),
	 *                   "s" (search string), "date" and "nsfw" (filter values
	 *                   as listed by getfilters()).
	 * @return array Result structure with "status", "spelling", "npt",
	 *               "answer", "web", "image", "video", "news" and "related".
	 * @throws Exception When the search string is empty, the next-page token
	 *                   cannot be decoded, or the page cannot be fetched.
	 */
	public function web($get){
		
		if(!empty($get["npt"])){
			
			[$token, $proxy] = $this->backend->get($get["npt"], "web");
			
			$q = json_decode($token, true);
			
			// a corrupt token would otherwise reach http_build_query() as null
			if(
				!is_array($q) ||
				!isset($q["q"], $q["nsfw"])
			){
				
				throw new Exception("Failed to decode next page token");
			}
			
			// "nsfw" travels in the cookie, not in the query string
			$nsfw = $q["nsfw"];
			unset($q["nsfw"]);
		}else{
			
			$search = $get["s"] ?? "";
			
			if(strlen($search) === 0){
				
				throw new Exception("Search term is empty!");
			}
			
			$proxy = $this->backend->get_ip();
			
			// cookie value is "0" when NSFW results are allowed, "1" otherwise
			// (presumably wiby's safe-search switch -- confirm against wiby)
			$nsfw = ($get["nsfw"] ?? null) === "yes" ? "0" : "1";
			
			$q = [
				"q" => $this->build_query($search, $get["date"] ?? "any")
			];
		}
		
		try{
			$html = $this->get(
				$proxy,
				"https://wiby.me/",
				$q,
				$nsfw
			);
		}catch(Exception $error){
			
			// same message as before; the cause is chained for debugging
			throw new Exception("Failed to fetch search page", 0, $error);
		}
		
		$page = $this->parse_next_page($html);
		
		if($page === null){
			
			$npt = null;
		}else{
			
			$npt =
				$this->backend->store(
					json_encode([
						"q" => $q["q"],
						"p" => $page,
						"nsfw" => $nsfw
					]),
					"web",
					$proxy
				);
		}
		
		return [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => $npt,
			"answer" => [],
			"web" => $this->parse_results($html),
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];
	}
	
	/**
	 * Strip reserved tokens from a search string and prepend the date token.
	 *
	 * @param string $search Raw user query.
	 * @param mixed  $date   Date filter value; unknown values add no prefix.
	 * @return string Query to send as the "q" parameter.
	 */
	private function build_query($search, $date){
		
		$search = str_replace(self::RESERVED_TOKENS, "", $search);
		
		if(
			is_string($date) &&
			isset(self::DATE_PREFIXES[$date])
		){
			
			$search = self::DATE_PREFIXES[$date] . $search;
		}
		
		return $search;
	}
	
	/**
	 * Extract the page number from the "Find more..." link.
	 *
	 * @param string $html Search result page.
	 * @return int|null Page number, or null when no such link is present.
	 */
	private function parse_next_page($html){
		
		$found =
			preg_match(
				'/<p class="pin"><blockquote>(?:<\/p>)?<br><a class="more" href="\/\?q=[^"]+&p=([0-9]+)">Find more\.\.\.<\/a><\/blockquote>/',
				$html,
				$match
			);
		
		return $found === 1 ? (int)$match[1] : null;
	}
	
	/**
	 * Extract result entries from the search page.
	 *
	 * @param string $html Search result page.
	 * @return array List of "web" result entries.
	 */
	private function parse_results($html){
		
		preg_match_all(
			'/<blockquote>[\s]*<a .* href="(.*)">(.*)<\/a>.*<p>(.*)<\/p>[\s]*<\/blockquote>/Ui',
			$html,
			$links
		);
		
		$results = [];
		
		foreach(array_keys($links[0]) as $i){
			
			$results[] = [
				"title" => $this->unescapehtml(trim($links[2][$i])),
				// also trims the dots around the snippet text
				"description" => $this->unescapehtml(trim(strip_tags($links[3][$i]), ".\n\r ")),
				// NOTE(review): href is passed through without entity decoding
				"url" => trim($links[1][$i]),
				"date" => null,
				"type" => "web",
				"thumb" => [
					"url" => null,
					"ratio" => null
				],
				"sublink" => [],
				"table" => []
			];
		}
		
		return $results;
	}
	
	/**
	 * Turn <br> variants into newlines and decode entities.
	 *
	 * With ENT_XML1 only the XML entities and numeric references are decoded.
	 *
	 * @param string $str HTML fragment.
	 * @return string Decoded text.
	 */
	private function unescapehtml($str){
		
		return html_entity_decode(
			str_replace(
				[
					"<br>",
					"<br/>",
					"</br>",
					"<BR>",
					"<BR/>",
					"</BR>",
				],
				"\n",
				$str
			),
			ENT_QUOTES | ENT_XML1, 'UTF-8'
		);
	}
}