mirror of
https://git.lolcat.ca/lolcat/4get.git
synced 2024-12-24 23:56:34 -05:00
duckduckgo scraper rewrite
This commit is contained in:
parent
0b3bbe0f15
commit
774f7113df
6 changed files with 1488 additions and 2234 deletions
21
api.txt
21
api.txt
|
@ -1,10 +1,17 @@
|
|||
__ __ __
|
||||
/ // / ____ ____ / /_
|
||||
/ // /_/ __ `/ _ \/ __/
|
||||
/__ __/ /_/ / __/ /_
|
||||
/_/ \__, /\___/\__/
|
||||
/____/
|
||||
|
||||
44
|
||||
4444444 44
|
||||
44444444 44444 444
|
||||
44444444 444444 444444444
|
||||
44444 44444444 444444444
|
||||
444444444 4444444
|
||||
4444444444 444444
|
||||
4444444444444
|
||||
444444444444444444
|
||||
444444444444444
|
||||
44444444
|
||||
4444
|
||||
44
|
||||
|
||||
+ Welcome to the 4get API documentation +
|
||||
|
||||
+ Terms of use
|
||||
|
|
|
@ -119,7 +119,7 @@ class config{
|
|||
|
||||
// Default user agent to use for scraper requests. Sometimes ignored to get specific webpages
|
||||
// Changing this might break things.
|
||||
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0";
|
||||
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0";
|
||||
|
||||
// Proxy pool assignments for each scraper
|
||||
// false = Use server's raw IP
|
||||
|
|
|
@ -75,6 +75,7 @@ class backend{
|
|||
break;
|
||||
|
||||
case "socks5_hostname":
|
||||
case "socks5h":
|
||||
case "socks5a":
|
||||
curl_setopt($curlproc, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5_HOSTNAME);
|
||||
curl_setopt($curlproc, CURLOPT_PROXY, $address . ":" . $port);
|
||||
|
|
|
@ -526,4 +526,85 @@ class fuckhtml{
|
|||
$string
|
||||
);
|
||||
}
|
||||
|
||||
public function extract_json($json){
|
||||
|
||||
$len = strlen($json);
|
||||
$array_level = 0;
|
||||
$object_level = 0;
|
||||
$in_quote = null;
|
||||
$start = null;
|
||||
|
||||
for($i=0; $i<$len; $i++){
|
||||
|
||||
switch($json[$i]){
|
||||
|
||||
case "[":
|
||||
if($in_quote === null){
|
||||
|
||||
$array_level++;
|
||||
if($start === null){
|
||||
|
||||
$start = $i;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case "]":
|
||||
if($in_quote === null){
|
||||
|
||||
$array_level--;
|
||||
}
|
||||
break;
|
||||
|
||||
case "{":
|
||||
if($in_quote === null){
|
||||
|
||||
$object_level++;
|
||||
if($start === null){
|
||||
|
||||
$start = $i;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case "}":
|
||||
if($in_quote === null){
|
||||
|
||||
$object_level--;
|
||||
}
|
||||
break;
|
||||
|
||||
case "\"":
|
||||
case "'":
|
||||
if(
|
||||
$i !== 0 &&
|
||||
$json[$i - 1] !== "\\"
|
||||
){
|
||||
// found a non-escaped quote
|
||||
|
||||
if($in_quote === null){
|
||||
|
||||
// open quote
|
||||
$in_quote = $json[$i];
|
||||
}elseif($in_quote === $json[$i]){
|
||||
|
||||
// close quote
|
||||
$in_quote = null;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if(
|
||||
$start !== null &&
|
||||
$array_level === 0 &&
|
||||
$object_level === 0
|
||||
){
|
||||
|
||||
return substr($json, $start, $i - $start + 1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
3616
scraper/ddg.php
3616
scraper/ddg.php
File diff suppressed because it is too large
Load diff
1
scraper/out.txt
Normal file
1
scraper/out.txt
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in a new issue