mirror of
https://git.lolcat.ca/lolcat/4get.git
synced 2024-12-24 23:56:34 -05:00
fixed yandex image scraper
This commit is contained in:
parent
5236452f45
commit
165d80f80b
1 changed files with 59 additions and 46 deletions
|
@ -636,6 +636,7 @@ class yandex{
|
||||||
|
|
||||||
throw new Exception("Failed to get JSON");
|
throw new Exception("Failed to get JSON");
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
$handle = fopen("scraper/yandex.json", "r");
|
$handle = fopen("scraper/yandex.json", "r");
|
||||||
$json = fread($handle, filesize("scraper/yandex.json"));
|
$json = fread($handle, filesize("scraper/yandex.json"));
|
||||||
|
@ -656,68 +657,80 @@ class yandex{
|
||||||
throw new Exception("Failed to decode JSON");
|
throw new Exception("Failed to decode JSON");
|
||||||
}
|
}
|
||||||
|
|
||||||
// get html
|
|
||||||
$html = "";
|
|
||||||
foreach($json["blocks"] as $block){
|
|
||||||
|
|
||||||
$html .= $block["html"];
|
|
||||||
}
|
|
||||||
|
|
||||||
$this->fuckhtml->load($html);
|
|
||||||
$div = $this->fuckhtml->getElementsByTagName("div");
|
|
||||||
|
|
||||||
$out = [
|
$out = [
|
||||||
"status" => "ok",
|
"status" => "ok",
|
||||||
"npt" => null,
|
"npt" => null,
|
||||||
"image" => []
|
"image" => []
|
||||||
];
|
];
|
||||||
|
|
||||||
// check for next page
|
// get html
|
||||||
if(
|
$html = "";
|
||||||
count(
|
foreach($json["blocks"] as $block){
|
||||||
$this->fuckhtml
|
|
||||||
->getElementsByClassName(
|
|
||||||
"more more_direction_next",
|
|
||||||
$div
|
|
||||||
)
|
|
||||||
) !== 0
|
|
||||||
){
|
|
||||||
|
|
||||||
$request["nsfw"] = $nsfw;
|
$html .= $block["html"];
|
||||||
|
|
||||||
if(isset($request["p"])){
|
// get next page
|
||||||
|
if(
|
||||||
|
isset($block["params"]["nextPageUrl"]) &&
|
||||||
|
!empty($block["params"]["nextPageUrl"])
|
||||||
|
){
|
||||||
|
|
||||||
$request["p"]++;
|
$request["nsfw"] = $nsfw;
|
||||||
}else{
|
|
||||||
|
|
||||||
$request["p"] = 1;
|
if(isset($request["p"])){
|
||||||
|
|
||||||
|
$request["p"]++;
|
||||||
|
}else{
|
||||||
|
|
||||||
|
$request["p"] = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
$out["npt"] =
|
||||||
|
$this->backend->store(
|
||||||
|
json_encode($request),
|
||||||
|
"images",
|
||||||
|
$proxy
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
$out["npt"] =
|
|
||||||
$this->backend->store(
|
|
||||||
json_encode($request),
|
|
||||||
"images",
|
|
||||||
$proxy
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$this->fuckhtml->load($html);
|
||||||
|
|
||||||
// get search results
|
// get search results
|
||||||
|
$data = null;
|
||||||
|
|
||||||
foreach(
|
foreach(
|
||||||
$this->fuckhtml
|
$this->fuckhtml
|
||||||
->getElementsByClassName(
|
->getElementsByClassName(
|
||||||
"serp-item serp-item_type_search",
|
"Root",
|
||||||
$div
|
"div"
|
||||||
)
|
) as $div
|
||||||
as $image
|
|
||||||
){
|
){
|
||||||
|
|
||||||
$image =
|
if(isset($div["attributes"]["data-state"])){
|
||||||
json_decode(
|
|
||||||
$image
|
$tmp = json_decode(
|
||||||
["attributes"]
|
$this->fuckhtml
|
||||||
["data-bem"],
|
->getTextContent(
|
||||||
|
$div["attributes"]["data-state"]
|
||||||
|
),
|
||||||
true
|
true
|
||||||
)["serp-item"];
|
);
|
||||||
|
|
||||||
|
if(isset($tmp["initialState"]["serpList"])){
|
||||||
|
|
||||||
|
$data = $tmp;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if($data === null){
|
||||||
|
|
||||||
|
throw new Exception("Failed to extract JSON");
|
||||||
|
}
|
||||||
|
|
||||||
|
foreach($data["initialState"]["serpList"]["items"]["entities"] as $image){
|
||||||
|
|
||||||
$title = [html_entity_decode($image["snippet"]["title"], ENT_QUOTES | ENT_HTML5)];
|
$title = [html_entity_decode($image["snippet"]["title"], ENT_QUOTES | ENT_HTML5)];
|
||||||
|
|
||||||
|
@ -738,7 +751,7 @@ class yandex{
|
||||||
"url" => htmlspecialchars_decode($image["snippet"]["url"])
|
"url" => htmlspecialchars_decode($image["snippet"]["url"])
|
||||||
];
|
];
|
||||||
|
|
||||||
foreach($image["dups"] as $dup){
|
foreach($image["viewerData"]["dups"] as $dup){
|
||||||
|
|
||||||
$tmp["source"][] = [
|
$tmp["source"][] = [
|
||||||
"url" => htmlspecialchars_decode($dup["url"]),
|
"url" => htmlspecialchars_decode($dup["url"]),
|
||||||
|
@ -752,10 +765,10 @@ class yandex{
|
||||||
preg_replace(
|
preg_replace(
|
||||||
'/^\/\//',
|
'/^\/\//',
|
||||||
"https://",
|
"https://",
|
||||||
htmlspecialchars_decode($image["thumb"]["url"])
|
htmlspecialchars_decode($image["viewerData"]["thumb"]["url"])
|
||||||
),
|
),
|
||||||
"width" => (int)$image["thumb"]["size"]["width"],
|
"width" => (int)$image["viewerData"]["thumb"]["size"]["width"],
|
||||||
"height" => (int)$image["thumb"]["size"]["height"]
|
"height" => (int)$image["viewerData"]["thumb"]["size"]["height"]
|
||||||
];
|
];
|
||||||
|
|
||||||
$out["image"][] = $tmp;
|
$out["image"][] = $tmp;
|
||||||
|
|
Loading…
Reference in a new issue