1
Fork 0
mirror of https://git.lolcat.ca/lolcat/4get.git synced 2024-12-24 23:56:34 -05:00

fixed yandex image scraper

This commit is contained in:
lolcat 2023-11-09 08:06:14 -05:00
parent 5236452f45
commit 165d80f80b

View file

@@ -636,6 +636,7 @@ class yandex{
throw new Exception("Failed to get JSON"); throw new Exception("Failed to get JSON");
} }
/* /*
$handle = fopen("scraper/yandex.json", "r"); $handle = fopen("scraper/yandex.json", "r");
$json = fread($handle, filesize("scraper/yandex.json")); $json = fread($handle, filesize("scraper/yandex.json"));
@@ -656,68 +657,80 @@ class yandex{
throw new Exception("Failed to decode JSON"); throw new Exception("Failed to decode JSON");
} }
// get html
$html = "";
foreach($json["blocks"] as $block){
$html .= $block["html"];
}
$this->fuckhtml->load($html);
$div = $this->fuckhtml->getElementsByTagName("div");
$out = [ $out = [
"status" => "ok", "status" => "ok",
"npt" => null, "npt" => null,
"image" => [] "image" => []
]; ];
// check for next page // get html
if( $html = "";
count( foreach($json["blocks"] as $block){
$this->fuckhtml
->getElementsByClassName(
"more more_direction_next",
$div
)
) !== 0
){
$request["nsfw"] = $nsfw; $html .= $block["html"];
if(isset($request["p"])){ // get next page
if(
isset($block["params"]["nextPageUrl"]) &&
!empty($block["params"]["nextPageUrl"])
){
$request["p"]++; $request["nsfw"] = $nsfw;
}else{
$request["p"] = 1; if(isset($request["p"])){
$request["p"]++;
}else{
$request["p"] = 1;
}
$out["npt"] =
$this->backend->store(
json_encode($request),
"images",
$proxy
);
} }
$out["npt"] =
$this->backend->store(
json_encode($request),
"images",
$proxy
);
} }
$this->fuckhtml->load($html);
// get search results // get search results
$data = null;
foreach( foreach(
$this->fuckhtml $this->fuckhtml
->getElementsByClassName( ->getElementsByClassName(
"serp-item serp-item_type_search", "Root",
$div "div"
) ) as $div
as $image
){ ){
$image = if(isset($div["attributes"]["data-state"])){
json_decode(
$image $tmp = json_decode(
["attributes"] $this->fuckhtml
["data-bem"], ->getTextContent(
$div["attributes"]["data-state"]
),
true true
)["serp-item"]; );
if(isset($tmp["initialState"]["serpList"])){
$data = $tmp;
break;
}
}
}
if($data === null){
throw new Exception("Failed to extract JSON");
}
foreach($data["initialState"]["serpList"]["items"]["entities"] as $image){
$title = [html_entity_decode($image["snippet"]["title"], ENT_QUOTES | ENT_HTML5)]; $title = [html_entity_decode($image["snippet"]["title"], ENT_QUOTES | ENT_HTML5)];
@@ -738,7 +751,7 @@ class yandex{
"url" => htmlspecialchars_decode($image["snippet"]["url"]) "url" => htmlspecialchars_decode($image["snippet"]["url"])
]; ];
foreach($image["dups"] as $dup){ foreach($image["viewerData"]["dups"] as $dup){
$tmp["source"][] = [ $tmp["source"][] = [
"url" => htmlspecialchars_decode($dup["url"]), "url" => htmlspecialchars_decode($dup["url"]),
@@ -752,10 +765,10 @@ class yandex{
preg_replace( preg_replace(
'/^\/\//', '/^\/\//',
"https://", "https://",
htmlspecialchars_decode($image["thumb"]["url"]) htmlspecialchars_decode($image["viewerData"]["thumb"]["url"])
), ),
"width" => (int)$image["thumb"]["size"]["width"], "width" => (int)$image["viewerData"]["thumb"]["size"]["width"],
"height" => (int)$image["thumb"]["size"]["height"] "height" => (int)$image["viewerData"]["thumb"]["size"]["height"]
]; ];
$out["image"][] = $tmp; $out["image"][] = $tmp;