Use a better robots.txt to prevent information leakage (#33)

This commit is contained in:
Nikita Karamov 2023-06-17 14:13:18 +02:00 committed by GitHub
commit 7bd29fd3cb
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 38 additions and 5 deletions

View file

@ -11,17 +11,17 @@ repos:
- id: check-toml
- id: check-yaml
- repo: https://github.com/pre-commit/mirrors-prettier
rev: "v2.7.1"
rev: "v3.0.0-alpha.9-for-vscode"
hooks:
- id: prettier
additional_dependencies:
- prettier@2
- repo: https://github.com/pre-commit/mirrors-eslint
rev: "v8.35.0"
rev: "v8.43.0"
hooks:
- id: eslint
additional_dependencies:
- eslint
- eslint-config-prettier
- eslint-plugin-unicorn
- prettier
- prettier@2

View file

@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
### Fixed
- blocked crawling of all pages other than index (#33)
## [2.4.4] - 2023-03-01
### Fixed

View file

@ -43,7 +43,7 @@ function normalizeUrl(url) {
if (!url.includes("http://") && !url.includes("https://")) {
url = "https://" + url;
}
if (url.charAt(url.length - 1) !== "/") {
if (url.at(-1) !== "/") {
url = url + "/";
}
return url;

View file

@ -1,2 +1,29 @@
User-agent: *
Allow: /
Allow: /$
Disallow: /
# Following borrowed from https://seirdy.one/robots.txt
# I opt out of online advertising so malware that injects ads on my site won't get paid.
# You should do the same.
User-Agent: Adsbot
Disallow: /
# > This robot collects content from the Internet for the sole purpose of helping educational institutions prevent plagiarism. [...] we compare student papers against the content we find on the Internet to see if we can find similarities. (http://www.turnitin.com/robot/crawlerinfo.html)
User-Agent: TurnitinBot
Disallow: /
# > NameProtect engages in crawling activity in search of a wide range of brand and other intellectual property violations that may be of interest to our clients. (http://www.nameprotect.com/botinfo.html)
User-Agent: NPBot
Disallow: /
# iThenticate is a new service we have developed to combat the piracy of intellectual property and ensure the originality of written work for publishers, non-profit agencies, corporations, and newspapers. (http://www.slysearch.com/)
User-Agent: SlySearch
Disallow: /
# BLEXBot assists internet marketers to get information on the link structure of sites and their interlinking on the web, to avoid any technical and possible legal issues and improve overall online experience. (http://webmeup-crawler.com/)
User-Agent: BLEXBot
Disallow: /
User-agent: ChatGPT-User
Disallow: /