mirror of
https://git.lolcat.ca/lolcat/4get.git
synced 2024-12-24 23:56:34 -05:00
fag protection
This commit is contained in:
parent
81502d4721
commit
2976c0a6a4
11 changed files with 82 additions and 40 deletions
|
@ -23,17 +23,13 @@ class config{
|
||||||
// Enable the API?
|
// Enable the API?
|
||||||
const API_ENABLED = true;
|
const API_ENABLED = true;
|
||||||
|
|
||||||
// Bot protection
|
//
|
||||||
// 4get.ca has been hit with 500k bot reqs every single day for months
|
// BOT PROTECTION
|
||||||
// you probably want to enable this if your instance is public...
|
//
|
||||||
// 0 = disabled
|
|
||||||
// 1 = ask for image captcha (requires imagemagick v6 or higher)
|
|
||||||
// @TODO: 2 = invite only (users needs a pass)
|
|
||||||
const BOT_PROTECTION = 0;
|
|
||||||
|
|
||||||
// Maximal number of searches per captcha key/pass issued. Counter gets
|
// 0 = disabled, 1 = ask for image captcha, @TODO: 2 = invite only (users need a pass)
|
||||||
// reset on every APCU cache clear (should happen once a day)
|
// VERY useful against a targeted attack
|
||||||
const MAX_SEARCHES = 100;
|
const BOT_PROTECTION = 0;
|
||||||
|
|
||||||
// if BOT_PROTECTION is set to 1, specify the available datasets here
|
// if BOT_PROTECTION is set to 1, specify the available datasets here
|
||||||
// images should be named from 1.png to X.png, and be 100x100 in size
|
// images should be named from 1.png to X.png, and be 100x100 in size
|
||||||
|
@ -45,6 +41,32 @@ class config{
|
||||||
// ["minecraft", 848]
|
// ["minecraft", 848]
|
||||||
];
|
];
|
||||||
|
|
||||||
|
// If this regular expression matches the user agent, the request is blocked
|
||||||
|
// Not useful at all against a targeted attack
|
||||||
|
const HEADER_REGEX = '/bot|wget|curl|python-requests|scrapy|go-http-client|ruby|yahoo|spider/i';
|
||||||
|
|
||||||
|
// Block clients who present any of the following headers in their request (SPECIFY IN !!lowercase!!)
|
||||||
|
// Eg: ["x-forwarded-for", "x-via", "forwarded-for", "via"];
|
||||||
|
// Useful for blocking *some* proxies used for botting
|
||||||
|
const FILTERED_HEADER_KEYS = [
|
||||||
|
"x-forwarded-for",
|
||||||
|
"x-via",
|
||||||
|
"forwarded-for",
|
||||||
|
"via"
|
||||||
|
];
|
||||||
|
|
||||||
|
// @TODO: Portscan the user for open proxies before allowing a connection, block user if any are found
|
||||||
|
// Requires the nmap package
|
||||||
|
const NMAP_PROXY_CHECK = false;
|
||||||
|
|
||||||
|
// @TODO: Make IP blacklist public under /api/v1/blacklist endpoint ?
|
||||||
|
const PUBLIC_IP_BLACKLIST = true;
|
||||||
|
|
||||||
|
// Maximum number of searches per captcha key/pass issued. Counter gets
|
||||||
|
// reset on every APCU cache clear (should happen once a day).
|
||||||
|
// Only useful when BOT_PROTECTION is NOT set to 0
|
||||||
|
const MAX_SEARCHES = 100;
|
||||||
|
|
||||||
// List of domains that point to your servers. Include your tor/i2p
|
// List of domains that point to your servers. Include your tor/i2p
|
||||||
// addresses here! Must be a valid URL. Won't affect links placed on
|
// addresses here! Must be a valid URL. Won't affect links placed on
|
||||||
// the homepage.
|
// the homepage.
|
||||||
|
|
|
@ -8,6 +8,9 @@ Welcome! This guide assumes that you have a working 4get instance. This will hel
|
||||||
3. The captcha imagesets are located in `data/captcha/your_image_set/*.png`
|
3. The captcha imagesets are located in `data/captcha/your_image_set/*.png`
|
||||||
4. The captcha font is located in `data/fonts/captcha.ttf`
|
4. The captcha font is located in `data/fonts/captcha.ttf`
|
||||||
|
|
||||||
|
## Robots.txt
|
||||||
|
Make sure you configure this right to optimize your search engine presence! Head over to `/robots.txt` and change the 4get.ca domain to your own domain.
|
||||||
|
|
||||||
## Server listing
|
## Server listing
|
||||||
To be listed on https://4get.ca/instances , you must contact *any* of the people in the server list and ask them to add you to their list of instances in their configuration. The instance list is distributed, and I don't have control over it.
|
To be listed on https://4get.ca/instances , you must contact *any* of the people in the server list and ask them to add you to their list of instances in their configuration. The instance list is distributed, and I don't have control over it.
|
||||||
|
|
||||||
|
@ -32,4 +35,4 @@ If you see spammy entries in your instances list, simply remove the instance fro
|
||||||
Done! The scraper you chose should now be using the rotating proxies. When asking for the next page of results, it will use the same proxy to avoid detection!
|
Done! The scraper you chose should now be using the rotating proxies. When asking for the next page of results, it will use the same proxy to avoid detection!
|
||||||
|
|
||||||
### Important!
|
### Important!
|
||||||
If you ever test out a `socks5` proxy locally on your machine and find out it works but doesn't on your server, try supplying the `socks5_hostname` protocol instead.
|
If you ever test out a `socks5` proxy locally on your machine and find out it works but doesn't on your server, try supplying the `socks5_hostname` protocol instead. Hopefully this tip can save you 3 hours of your life!
|
||||||
|
|
|
@ -29,7 +29,7 @@ try{
|
||||||
|
|
||||||
}catch(Exception $error){
|
}catch(Exception $error){
|
||||||
|
|
||||||
$frontend->drawscrapererror($error->getMessage(), $get, "images");
|
$frontend->drawscrapererror($error->getMessage(), $get, "images", $payload["timetaken"]);
|
||||||
}
|
}
|
||||||
|
|
||||||
if(count($results["image"]) === 0){
|
if(count($results["image"]) === 0){
|
||||||
|
|
|
@ -32,6 +32,8 @@ class backend{
|
||||||
|
|
||||||
$proxylist = array_values($proxylist);
|
$proxylist = array_values($proxylist);
|
||||||
|
|
||||||
|
echo $proxy_index_raw % count($proxylist);
|
||||||
|
|
||||||
return $proxylist[$proxy_index_raw % count($proxylist)];
|
return $proxylist[$proxy_index_raw % count($proxylist)];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -44,7 +44,7 @@ class frontend{
|
||||||
$replacements["timetaken"] !== null
|
$replacements["timetaken"] !== null
|
||||||
){
|
){
|
||||||
|
|
||||||
$replacements["timetaken"] = '<div class="timetaken">Took ' . substr(microtime(true) - $replacements["timetaken"], 0, 4) . 's</div>';
|
$replacements["timetaken"] = '<div class="timetaken">Took ' . number_format(microtime(true) - $replacements["timetaken"], 2) . 's</div>';
|
||||||
}
|
}
|
||||||
|
|
||||||
$handle = fopen("template/{$template}", "r");
|
$handle = fopen("template/{$template}", "r");
|
||||||
|
@ -84,29 +84,54 @@ class frontend{
|
||||||
"filters" => $this->generatehtmlfilters($filters, $get)
|
"filters" => $this->generatehtmlfilters($filters, $get)
|
||||||
]);
|
]);
|
||||||
|
|
||||||
|
$headers_raw = getallheaders();
|
||||||
|
$header_keys = [];
|
||||||
|
$user_agent = "";
|
||||||
|
$bad_header = false;
|
||||||
|
|
||||||
|
foreach($headers_raw as $headerkey => $headervalue){
|
||||||
|
|
||||||
|
$headerkey = strtolower($headerkey);
|
||||||
|
if($headerkey == "user-agent"){
|
||||||
|
|
||||||
|
$user_agent = $headervalue;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// check header key
|
||||||
|
if(in_array($headerkey, config::FILTERED_HEADER_KEYS)){
|
||||||
|
|
||||||
|
$bad_header = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if(
|
if(
|
||||||
preg_match(
|
preg_match(
|
||||||
'/bot|wget|curl|python-requests|scrapy|feedfetcher|go-http-client|ruby|universalfeedparser|yahoo\! slurp|spider|rss/i',
|
config::HEADER_REGEX,
|
||||||
$_SERVER["HTTP_USER_AGENT"]
|
$user_agent
|
||||||
)
|
) ||
|
||||||
|
$bad_header === true
|
||||||
){
|
){
|
||||||
|
|
||||||
// bot detected !!
|
// bot detected !!
|
||||||
apcu_inc("captcha_gen");
|
apcu_inc("captcha_gen");
|
||||||
|
|
||||||
|
$null = null;
|
||||||
$this->drawerror(
|
$this->drawerror(
|
||||||
"Tshh, blocked!",
|
"Tshh, blocked!",
|
||||||
'You were blocked from viewing this page. If you wish to scrape data from 4get, please consider running <a href="https://git.lolcat.ca/lolcat/4get" rel="noreferrer nofollow">your own 4get instance</a>.',
|
'Your browser, IP or IP range has been blocked from this 4get instance. If this is an error, please <a href="/about">contact the administrator</a>.',
|
||||||
|
microtime(true)
|
||||||
);
|
);
|
||||||
die();
|
die();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public function drawerror($title, $error){
|
public function drawerror($title, $error, $timetaken){
|
||||||
|
|
||||||
echo
|
echo
|
||||||
$this->load("search.html", [
|
$this->load("search.html", [
|
||||||
"timetaken" => null,
|
"timetaken" => $timetaken,
|
||||||
"class" => "",
|
"class" => "",
|
||||||
"right-left" => "",
|
"right-left" => "",
|
||||||
"right-right" => "",
|
"right-right" => "",
|
||||||
|
@ -119,7 +144,7 @@ class frontend{
|
||||||
die();
|
die();
|
||||||
}
|
}
|
||||||
|
|
||||||
public function drawscrapererror($error, $get, $target){
|
public function drawscrapererror($error, $get, $target, $timetaken){
|
||||||
|
|
||||||
$this->drawerror(
|
$this->drawerror(
|
||||||
"Shit",
|
"Shit",
|
||||||
|
@ -131,7 +156,8 @@ class frontend{
|
||||||
'<li>Remove keywords that could cause errors</li>' .
|
'<li>Remove keywords that could cause errors</li>' .
|
||||||
'<li><a href="/instances?target=' . $target . "&" . $this->buildquery($get, false) . '">Try your search on another 4get instance</a></li>' .
|
'<li><a href="/instances?target=' . $target . "&" . $this->buildquery($get, false) . '">Try your search on another 4get instance</a></li>' .
|
||||||
'</ul><br>' .
|
'</ul><br>' .
|
||||||
'If the error persists, please <a href="/about">contact the administrator</a>.'
|
'If the error persists, please <a href="/about">contact the administrator</a>.',
|
||||||
|
$timetaken
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -483,10 +509,6 @@ class frontend{
|
||||||
$archives[] = "warosu.org";
|
$archives[] = "warosu.org";
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case "cm":
|
|
||||||
$archives[] = "boards.fireden.net";
|
|
||||||
break;
|
|
||||||
|
|
||||||
case "f":
|
case "f":
|
||||||
$archives[] = "archive.4plebs.org";
|
$archives[] = "archive.4plebs.org";
|
||||||
break;
|
break;
|
||||||
|
@ -503,12 +525,10 @@ class frontend{
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case "v":
|
case "v":
|
||||||
$archives[] = "boards.fireden.net";
|
|
||||||
$archives[] = "arch.b4k.co";
|
$archives[] = "arch.b4k.co";
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case "vg":
|
case "vg":
|
||||||
$archives[] = "boards.fireden.net";
|
|
||||||
$archives[] = "arch.b4k.co";
|
$archives[] = "arch.b4k.co";
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@ -579,7 +599,6 @@ class frontend{
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case "sci":
|
case "sci":
|
||||||
$archives[] = "boards.fireden.net";
|
|
||||||
$archives[] = "warosu.org";
|
$archives[] = "warosu.org";
|
||||||
$archives[] = "eientei.xyz";
|
$archives[] = "eientei.xyz";
|
||||||
break;
|
break;
|
||||||
|
@ -614,7 +633,6 @@ class frontend{
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case "ic":
|
case "ic":
|
||||||
$archives[] = "boards.fireden.net";
|
|
||||||
$archives[] = "warosu.org";
|
$archives[] = "warosu.org";
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@ -741,10 +759,6 @@ class frontend{
|
||||||
$archives[] = "desuarchive.org";
|
$archives[] = "desuarchive.org";
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case "y":
|
|
||||||
$archives[] = "boards.fireden.net";
|
|
||||||
break;
|
|
||||||
|
|
||||||
case "t":
|
case "t":
|
||||||
$archives[] = "archiveofsins.com";
|
$archives[] = "archiveofsins.com";
|
||||||
break;
|
break;
|
||||||
|
@ -802,7 +816,7 @@ class frontend{
|
||||||
$payload .=
|
$payload .=
|
||||||
'<a href="https://webcache.googleusercontent.com/search?q=cache:' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://google.com" alt="go">Google cache</a>' .
|
'<a href="https://webcache.googleusercontent.com/search?q=cache:' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://google.com" alt="go">Google cache</a>' .
|
||||||
'<a href="https://web.archive.org/web/' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://archive.org" alt="ar">Archive.org</a>' .
|
'<a href="https://web.archive.org/web/' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://archive.org" alt="ar">Archive.org</a>' .
|
||||||
'<a href="https://archive.is/newest/' . htmlspecialchars($link) . '" class="list" target="_BLANK"><img src="/favicon?s=https://archive.is" alt="ar">Archive.is</a>' .
|
'<a href="https://archive.ph/newest/' . htmlspecialchars($link) . '" class="list" target="_BLANK"><img src="/favicon?s=https://archive.is" alt="ar">Archive.is</a>' .
|
||||||
'<a href="https://ghostarchive.org/search?term=' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://ghostarchive.org" alt="gh">Ghostarchive</a>' .
|
'<a href="https://ghostarchive.org/search?term=' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://ghostarchive.org" alt="gh">Ghostarchive</a>' .
|
||||||
'<a href="https://www.bing.com/search?q=url%3A' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://bing.com" alt="bi">Bing cache</a>' .
|
'<a href="https://www.bing.com/search?q=url%3A' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://bing.com" alt="bi">Bing cache</a>' .
|
||||||
'<a href="https://megalodon.jp/?url=' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://megalodon.jp" alt="me">Megalodon</a>' .
|
'<a href="https://megalodon.jp/?url=' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://megalodon.jp" alt="me">Megalodon</a>' .
|
||||||
|
|
|
@ -73,7 +73,7 @@ class fuckhtml{
|
||||||
$attributes = [];
|
$attributes = [];
|
||||||
|
|
||||||
preg_match_all(
|
preg_match_all(
|
||||||
'/([^\/\s\\=]+)(?:\s*=\s*("[^"]*"|\'[^\']*\'|[^\s]*))?/',
|
'/([^\/\s\\=]+)(?:\s*=\s*("[^"]*"|\'[^\']*\'|[^\s]*))?/i',
|
||||||
$starting_tags[2][$i][0],
|
$starting_tags[2][$i][0],
|
||||||
$regex_attributes
|
$regex_attributes
|
||||||
);
|
);
|
||||||
|
@ -88,7 +88,7 @@ class fuckhtml{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
$attributes[$regex_attributes[1][$k]] =
|
$attributes[strtolower($regex_attributes[1][$k])] =
|
||||||
trim($regex_attributes[2][$k], "'\" \n\r\t\v\x00");
|
trim($regex_attributes[2][$k], "'\" \n\r\t\v\x00");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -31,7 +31,7 @@ try{
|
||||||
|
|
||||||
}catch(Exception $error){
|
}catch(Exception $error){
|
||||||
|
|
||||||
$frontend->drawscrapererror($error->getMessage(), $get, "music");
|
$frontend->drawscrapererror($error->getMessage(), $get, "music", $payload["timetaken"]);
|
||||||
}
|
}
|
||||||
|
|
||||||
$categories = [
|
$categories = [
|
||||||
|
|
2
news.php
2
news.php
|
@ -31,7 +31,7 @@ try{
|
||||||
|
|
||||||
}catch(Exception $error){
|
}catch(Exception $error){
|
||||||
|
|
||||||
$frontend->drawscrapererror($error->getMessage(), $get, "news");
|
$frontend->drawscrapererror($error->getMessage(), $get, "news", $payload["timetaken"]);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -654,6 +654,7 @@ class google{
|
||||||
|
|
||||||
throw new Exception("Failed to get HTML");
|
throw new Exception("Failed to get HTML");
|
||||||
}
|
}
|
||||||
|
|
||||||
//$html = file_get_contents("scraper/google.html");
|
//$html = file_get_contents("scraper/google.html");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -31,7 +31,7 @@ try{
|
||||||
|
|
||||||
}catch(Exception $error){
|
}catch(Exception $error){
|
||||||
|
|
||||||
$frontend->drawscrapererror($error->getMessage(), $get, "videos");
|
$frontend->drawscrapererror($error->getMessage(), $get, "videos", $payload["timetaken"]);
|
||||||
}
|
}
|
||||||
|
|
||||||
$categories = [
|
$categories = [
|
||||||
|
|
2
web.php
2
web.php
|
@ -31,7 +31,7 @@ try{
|
||||||
|
|
||||||
}catch(Exception $error){
|
}catch(Exception $error){
|
||||||
|
|
||||||
$frontend->drawscrapererror($error->getMessage(), $get, "web");
|
$frontend->drawscrapererror($error->getMessage(), $get, "web", $payload["timetaken"]);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
Loading…
Reference in a new issue