mirror of
https://git.lolcat.ca/lolcat/4get.git
synced 2024-11-08 17:43:07 -05:00
Compare commits
9 commits
b98a393421
...
53d40c6e4e
Author | SHA1 | Date | |
---|---|---|---|
|
53d40c6e4e | ||
|
feb0a6dfc3 | ||
|
9c00182b2e | ||
|
03ccd75f4b | ||
|
7a91eb7839 | ||
|
6df9d17ada | ||
|
8161f8e7b8 | ||
|
d3fe37e7b6 | ||
|
e252bf4fce |
3 changed files with 199 additions and 108 deletions
|
@ -703,6 +703,43 @@ class google{
|
||||||
}
|
}
|
||||||
|
|
||||||
// reset
|
// reset
|
||||||
|
$this->fuckhtml->load($result_div);
|
||||||
|
}else{
|
||||||
|
|
||||||
|
// get the "Did you mean?" prompt
|
||||||
|
$taw =
|
||||||
|
$this->fuckhtml
|
||||||
|
->getElementById(
|
||||||
|
"taw"
|
||||||
|
);
|
||||||
|
|
||||||
|
if($taw){
|
||||||
|
|
||||||
|
$this->fuckhtml->load($taw);
|
||||||
|
|
||||||
|
$as =
|
||||||
|
$this->fuckhtml
|
||||||
|
->getElementsByTagName(
|
||||||
|
"a"
|
||||||
|
);
|
||||||
|
|
||||||
|
if(count($as) !== 0){
|
||||||
|
|
||||||
|
$text =
|
||||||
|
$this->fuckhtml
|
||||||
|
->getTextContent(
|
||||||
|
$as[0]
|
||||||
|
);
|
||||||
|
|
||||||
|
// @TODO implement did_you_mean
|
||||||
|
$out["spelling"] = [
|
||||||
|
"type" => "including",
|
||||||
|
"using" => $search,
|
||||||
|
"correction" => $text
|
||||||
|
];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
$this->fuckhtml->load($result_div);
|
$this->fuckhtml->load($result_div);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -895,36 +932,10 @@ class google{
|
||||||
// get "Related Searches" and "People also search for"
|
// get "Related Searches" and "People also search for"
|
||||||
//
|
//
|
||||||
$relateds =
|
$relateds =
|
||||||
array_merge(
|
$this->fuckhtml
|
||||||
$this->fuckhtml
|
->getElementsByClassName(
|
||||||
->getElementsByClassName(
|
"wyccme",
|
||||||
$this->getstyle(
|
"div"
|
||||||
[
|
|
||||||
"align-items" => "center",
|
|
||||||
"background-color" => "#28292a",
|
|
||||||
"border-radius" => "100px",
|
|
||||||
"box-sizing" => "border-box",
|
|
||||||
"display" => "flex",
|
|
||||||
"max-height" => "none",
|
|
||||||
"min-height" => "48px",
|
|
||||||
"padding-left" => "17px",
|
|
||||||
"padding-right" => "17px",
|
|
||||||
"position" => "relative"
|
|
||||||
]
|
|
||||||
) . " " .
|
|
||||||
$this->getstyle(
|
|
||||||
[
|
|
||||||
"margin-left" => "8px",
|
|
||||||
"margin-right" => "8px"
|
|
||||||
]
|
|
||||||
),
|
|
||||||
"a"
|
|
||||||
),
|
|
||||||
$this->fuckhtml
|
|
||||||
->getElementsByClassName(
|
|
||||||
"wyccme",
|
|
||||||
"div"
|
|
||||||
)
|
|
||||||
);
|
);
|
||||||
|
|
||||||
foreach($relateds as $related){
|
foreach($relateds as $related){
|
||||||
|
@ -1354,7 +1365,7 @@ class google{
|
||||||
"font-size" => "12px",
|
"font-size" => "12px",
|
||||||
"line-height" => "1.34",
|
"line-height" => "1.34",
|
||||||
"display" => "inline-block",
|
"display" => "inline-block",
|
||||||
"font-family" => "Google Sans,arial,sans-serif",
|
"font-family" => "google sans,arial,sans-serif",
|
||||||
"padding-right" => "0",
|
"padding-right" => "0",
|
||||||
"white-space" => "nowrap"
|
"white-space" => "nowrap"
|
||||||
]
|
]
|
||||||
|
@ -1401,7 +1412,7 @@ class google{
|
||||||
"line-height" => "22px",
|
"line-height" => "22px",
|
||||||
"overflow" => "hidden",
|
"overflow" => "hidden",
|
||||||
"word-break" => "break-word",
|
"word-break" => "break-word",
|
||||||
"color" => "#bdc1c6"
|
"color" => "#4d5156"
|
||||||
]
|
]
|
||||||
),
|
),
|
||||||
"div"
|
"div"
|
||||||
|
@ -1415,12 +1426,9 @@ class google{
|
||||||
->getElementsByClassName(
|
->getElementsByClassName(
|
||||||
$this->getstyle(
|
$this->getstyle(
|
||||||
[
|
[
|
||||||
"border-radius" => "10px",
|
"background-color" => "rgba(0,0,0,0.6)",
|
||||||
"font-family" => "arial,sans-serif-medium,sans-serif",
|
"color" => "#fff",
|
||||||
"font-size" => "12px",
|
"fill" => "#fff"
|
||||||
"line-height" => "16px",
|
|
||||||
"padding-block" => "2px",
|
|
||||||
"padding-inline" => "8px"
|
|
||||||
]
|
]
|
||||||
),
|
),
|
||||||
"div"
|
"div"
|
||||||
|
@ -1433,14 +1441,6 @@ class google{
|
||||||
->getTextContent(
|
->getTextContent(
|
||||||
$duration[0]
|
$duration[0]
|
||||||
);
|
);
|
||||||
|
|
||||||
// remove duration from description
|
|
||||||
$description[0]["innerHTML"] =
|
|
||||||
str_replace(
|
|
||||||
$duration[0]["outerHTML"],
|
|
||||||
"",
|
|
||||||
$description[0]["innerHTML"]
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
$web["description"] =
|
$web["description"] =
|
||||||
|
@ -1979,7 +1979,7 @@ class google{
|
||||||
"font-size" => "12px",
|
"font-size" => "12px",
|
||||||
"line-height" => "1.34",
|
"line-height" => "1.34",
|
||||||
"display" => "inline-block",
|
"display" => "inline-block",
|
||||||
"font-family" => "Google Sans,arial,sans-serif",
|
"font-family" => "google sans,arial,sans-serif",
|
||||||
"padding-right" => "0",
|
"padding-right" => "0",
|
||||||
"white-space" => "nowrap"
|
"white-space" => "nowrap"
|
||||||
]
|
]
|
||||||
|
@ -2211,7 +2211,7 @@ class google{
|
||||||
->getElementsByClassName(
|
->getElementsByClassName(
|
||||||
$this->getstyle(
|
$this->getstyle(
|
||||||
[
|
[
|
||||||
"font-family" => "Google Sans,arial,sans-serif",
|
"font-family" => "google sans,arial,sans-serif",
|
||||||
"font-size" => "28px",
|
"font-size" => "28px",
|
||||||
"line-height" => "36px"
|
"line-height" => "36px"
|
||||||
]
|
]
|
||||||
|
@ -2801,7 +2801,22 @@ class google{
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// get thumbnail
|
// get heading element
|
||||||
|
$heading =
|
||||||
|
$this->fuckhtml
|
||||||
|
->getElementsByAttributeValue(
|
||||||
|
"role",
|
||||||
|
"heading",
|
||||||
|
"div"
|
||||||
|
);
|
||||||
|
|
||||||
|
if(count($heading) === 0){
|
||||||
|
|
||||||
|
// no heading, fuck this.
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// get thumbnail before loading heading object
|
||||||
$image =
|
$image =
|
||||||
$this->fuckhtml
|
$this->fuckhtml
|
||||||
->getElementsByAttributeName(
|
->getElementsByAttributeName(
|
||||||
|
@ -2823,35 +2838,6 @@ class google{
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
// get title
|
|
||||||
$title =
|
|
||||||
$this->fuckhtml
|
|
||||||
->getElementsByClassName(
|
|
||||||
$this->getstyle(
|
|
||||||
[
|
|
||||||
"font-family" => "arial,sans-serif",
|
|
||||||
"font-size" => "16px",
|
|
||||||
"font-weight" => "400",
|
|
||||||
"line-height" => "24px"
|
|
||||||
]
|
|
||||||
),
|
|
||||||
"div"
|
|
||||||
);
|
|
||||||
|
|
||||||
if(count($title) === 0){
|
|
||||||
|
|
||||||
// ?? no title
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
$title =
|
|
||||||
$this->titledots(
|
|
||||||
$this->fuckhtml
|
|
||||||
->getTextContent(
|
|
||||||
$title[0]
|
|
||||||
)
|
|
||||||
);
|
|
||||||
|
|
||||||
// get duration
|
// get duration
|
||||||
$duration_div =
|
$duration_div =
|
||||||
$this->fuckhtml
|
$this->fuckhtml
|
||||||
|
@ -2908,6 +2894,38 @@ class google{
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// load heading
|
||||||
|
$this->fuckhtml->load($heading[0]);
|
||||||
|
|
||||||
|
// get title
|
||||||
|
$title =
|
||||||
|
$this->fuckhtml
|
||||||
|
->getElementsByClassName(
|
||||||
|
$this->getstyle(
|
||||||
|
[
|
||||||
|
"font-family" => "arial,sans-serif",
|
||||||
|
"font-size" => "16px",
|
||||||
|
"font-weight" => "400",
|
||||||
|
"line-height" => "24px"
|
||||||
|
]
|
||||||
|
),
|
||||||
|
"div"
|
||||||
|
);
|
||||||
|
|
||||||
|
if(count($title) === 0){
|
||||||
|
|
||||||
|
// ?? no title
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
$title =
|
||||||
|
$this->titledots(
|
||||||
|
$this->fuckhtml
|
||||||
|
->getTextContent(
|
||||||
|
$title[0]
|
||||||
|
)
|
||||||
|
);
|
||||||
|
|
||||||
// get date
|
// get date
|
||||||
$date_div =
|
$date_div =
|
||||||
$this->fuckhtml
|
$this->fuckhtml
|
||||||
|
@ -3940,7 +3958,7 @@ class google{
|
||||||
for($k=0; $k<count($values_regex[1]); $k++){
|
for($k=0; $k<count($values_regex[1]); $k++){
|
||||||
|
|
||||||
$values[trim($values_regex[1][$k])] =
|
$values[trim($values_regex[1][$k])] =
|
||||||
trim($values_regex[2][$k]);
|
strtolower(trim($values_regex[2][$k]));
|
||||||
}
|
}
|
||||||
|
|
||||||
$names = explode(",", $matches[1][$i]);
|
$names = explode(",", $matches[1][$i]);
|
||||||
|
@ -3971,7 +3989,7 @@ class google{
|
||||||
|
|
||||||
foreach($this->styles[":root"] as $key => $value){
|
foreach($this->styles[":root"] as $key => $value){
|
||||||
|
|
||||||
$this->css_colors[$value] = $key;
|
$this->css_colors[$value] = strtolower($key);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -4206,7 +4224,7 @@ class google{
|
||||||
throw new Exception("Failed to get HTML");
|
throw new Exception("Failed to get HTML");
|
||||||
}
|
}
|
||||||
|
|
||||||
//$html = file_get_contents("scraper/google-video.html");
|
//$html = file_get_contents("scraper/google.html");
|
||||||
|
|
||||||
$response = $this->parsepage($html, "videos", $search, $proxy, $params);
|
$response = $this->parsepage($html, "videos", $search, $proxy, $params);
|
||||||
$out = [
|
$out = [
|
||||||
|
|
110
scraper/sc.php
110
scraper/sc.php
|
@ -6,6 +6,9 @@ class sc{
|
||||||
|
|
||||||
include "lib/backend.php";
|
include "lib/backend.php";
|
||||||
$this->backend = new backend("sc");
|
$this->backend = new backend("sc");
|
||||||
|
|
||||||
|
include "lib/fuckhtml.php";
|
||||||
|
$this->fuckhtml = new fuckhtml();
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getfilters($page){
|
public function getfilters($page){
|
||||||
|
@ -25,7 +28,7 @@ class sc{
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
private function get($proxy, $url, $get = []){
|
private function get($proxy, $url, $get = [], $web_req = false){
|
||||||
|
|
||||||
$curlproc = curl_init();
|
$curlproc = curl_init();
|
||||||
|
|
||||||
|
@ -37,19 +40,43 @@ class sc{
|
||||||
curl_setopt($curlproc, CURLOPT_URL, $url);
|
curl_setopt($curlproc, CURLOPT_URL, $url);
|
||||||
|
|
||||||
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
|
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
|
||||||
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
|
|
||||||
["User-Agent: " . config::USER_AGENT,
|
// use http2
|
||||||
"Accept: application/json, text/javascript, */*; q=0.01",
|
curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
|
||||||
"Accept-Language: en-US,en;q=0.5",
|
|
||||||
"Accept-Encoding: gzip",
|
if($web_req === false){
|
||||||
"Referer: https://soundcloud.com/",
|
|
||||||
"Origin: https://soundcloud.com",
|
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
|
||||||
"DNT: 1",
|
["User-Agent: " . config::USER_AGENT,
|
||||||
"Connection: keep-alive",
|
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
||||||
"Sec-Fetch-Dest: empty",
|
"Accept-Language: en-US,en;q=0.5",
|
||||||
"Sec-Fetch-Mode: cors",
|
"Accept-Encoding: gzip",
|
||||||
"Sec-Fetch-Site: same-site"]
|
"Referer: https://soundcloud.com/",
|
||||||
);
|
"Origin: https://soundcloud.com",
|
||||||
|
"DNT: 1",
|
||||||
|
"Connection: keep-alive",
|
||||||
|
"Sec-Fetch-Dest: empty",
|
||||||
|
"Sec-Fetch-Mode: cors",
|
||||||
|
"Sec-Fetch-Site: same-site",
|
||||||
|
"Priority: u=1"]
|
||||||
|
);
|
||||||
|
}else{
|
||||||
|
|
||||||
|
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
|
||||||
|
["User-Agent: " . config::USER_AGENT,
|
||||||
|
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
||||||
|
"Accept-Language: en-US,en;q=0.5",
|
||||||
|
"Accept-Encoding: gzip",
|
||||||
|
"DNT: 1",
|
||||||
|
"Connection: keep-alive",
|
||||||
|
"Upgrade-Insecure-Requests: 1",
|
||||||
|
"Sec-Fetch-Dest: document",
|
||||||
|
"Sec-Fetch-Mode: navigate",
|
||||||
|
"Sec-Fetch-Site: cross-site",
|
||||||
|
"Priority: u=1",
|
||||||
|
"TE: trailers"]
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
|
curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
|
||||||
curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
|
curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
|
||||||
|
@ -300,9 +327,12 @@ class sc{
|
||||||
$description[] = $song["title"];
|
$description[] = $song["title"];
|
||||||
}
|
}
|
||||||
|
|
||||||
if(count($description) != 0){
|
if(count($description) !== 0){
|
||||||
|
|
||||||
$description = trim($count . " songs. " . implode(", ", $description));
|
$description = trim($count . " songs. " . implode(", ", $description));
|
||||||
|
}else{
|
||||||
|
|
||||||
|
$description = "";
|
||||||
}
|
}
|
||||||
|
|
||||||
if(
|
if(
|
||||||
|
@ -396,13 +426,48 @@ class sc{
|
||||||
|
|
||||||
$token = apcu_fetch("sc_token");
|
$token = apcu_fetch("sc_token");
|
||||||
|
|
||||||
if($token === false){
|
if($token !== false){
|
||||||
|
|
||||||
|
return $token;
|
||||||
|
}
|
||||||
|
|
||||||
|
// search through all javascript components on the main page
|
||||||
|
try{
|
||||||
|
$html =
|
||||||
|
$this->get(
|
||||||
|
$proxy,
|
||||||
|
"https://soundcloud.com",
|
||||||
|
[],
|
||||||
|
true
|
||||||
|
);
|
||||||
|
}catch(Exception $error){
|
||||||
|
|
||||||
|
throw new Exception("Failed to fetch front page");
|
||||||
|
}
|
||||||
|
|
||||||
|
$this->fuckhtml->load($html);
|
||||||
|
|
||||||
|
$scripts =
|
||||||
|
$this->fuckhtml
|
||||||
|
->getElementsByTagName(
|
||||||
|
"script"
|
||||||
|
);
|
||||||
|
|
||||||
|
foreach($scripts as $script){
|
||||||
|
|
||||||
|
if(
|
||||||
|
!isset($script["attributes"]["src"]) ||
|
||||||
|
strpos($script["attributes"]["src"], "sndcdn.com") === false
|
||||||
|
){
|
||||||
|
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
try{
|
try{
|
||||||
$js =
|
$js =
|
||||||
$this->get(
|
$this->get(
|
||||||
$proxy,
|
$proxy,
|
||||||
"https://a-v2.sndcdn.com/assets/0-a901c1e0.js",
|
$script["attributes"]["src"],
|
||||||
[]
|
[]
|
||||||
);
|
);
|
||||||
}catch(Exception $error){
|
}catch(Exception $error){
|
||||||
|
@ -416,16 +481,15 @@ class sc{
|
||||||
$token
|
$token
|
||||||
);
|
);
|
||||||
|
|
||||||
if(!isset($token[1])){
|
if(isset($token[1])){
|
||||||
|
|
||||||
throw new Exception("Failed to get search token");
|
apcu_store("sc_token", $token[1]);
|
||||||
|
return $token[1];
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
apcu_store("sc_token", $token[1]);
|
|
||||||
return $token[1];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return $token;
|
throw new Exception("Did not find a Soundcloud token in the Javascript blobs");
|
||||||
}
|
}
|
||||||
|
|
||||||
private function limitstrlen($text){
|
private function limitstrlen($text){
|
||||||
|
|
|
@ -252,21 +252,30 @@ class yep{
|
||||||
|
|
||||||
curl_setopt($curlproc, CURLOPT_URL, $url);
|
curl_setopt($curlproc, CURLOPT_URL, $url);
|
||||||
|
|
||||||
|
// use http2
|
||||||
|
curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
|
||||||
|
|
||||||
|
// set ciphers
|
||||||
|
curl_setopt(
|
||||||
|
$curlproc,
|
||||||
|
CURLOPT_SSL_CIPHER_LIST,
|
||||||
|
"aes_128_gcm_sha_256,chacha20_poly1305_sha_256,aes_256_gcm_sha_384,ecdhe_ecdsa_aes_128_gcm_sha_256,ecdhe_rsa_aes_128_gcm_sha_256,ecdhe_ecdsa_chacha20_poly1305_sha_256,ecdhe_rsa_chacha20_poly1305_sha_256,ecdhe_ecdsa_aes_256_gcm_sha_384,ecdhe_rsa_aes_256_gcm_sha_384,ecdhe_ecdsa_aes_256_sha,ecdhe_ecdsa_aes_128_sha,ecdhe_rsa_aes_128_sha,ecdhe_rsa_aes_256_sha,rsa_aes_128_gcm_sha_256,rsa_aes_256_gcm_sha_384,rsa_aes_128_sha,rsa_aes_256_sha"
|
||||||
|
);
|
||||||
|
|
||||||
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
|
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
|
||||||
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
|
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
|
||||||
["User-Agent: " . config::USER_AGENT,
|
["User-Agent: " . config::USER_AGENT,
|
||||||
"Accept: */*",
|
"Accept: */*",
|
||||||
"Accept-Language: en-US,en;q=0.5",
|
"Accept-Language: en-US,en;q=0.5",
|
||||||
"Accept-Encoding: gzip, deflate, br, zstd",
|
"Accept-Encoding: gzip, deflate, br, zstd",
|
||||||
"Connection: keep-alive",
|
|
||||||
"DNT: 1",
|
|
||||||
"Priority: u=1",
|
|
||||||
"Origin: https://yep.com",
|
|
||||||
"Referer: https://yep.com/",
|
"Referer: https://yep.com/",
|
||||||
|
"Origin: https://yep.com",
|
||||||
|
"DNT: 1",
|
||||||
"Connection: keep-alive",
|
"Connection: keep-alive",
|
||||||
"Sec-Fetch-Dest: empty",
|
"Sec-Fetch-Dest: empty",
|
||||||
"Sec-Fetch-Mode: cors",
|
"Sec-Fetch-Mode: cors",
|
||||||
"Sec-Fetch-Site: same-site",
|
"Sec-Fetch-Site: same-site",
|
||||||
|
"Priority: u=4",
|
||||||
"TE: trailers"]
|
"TE: trailers"]
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue