From 9c0d08eb7d3971eaec95dd3eb75bbbcbfce1dfef Mon Sep 17 00:00:00 2001 From: Nikita Karamov Date: Sat, 17 Jun 2023 14:00:32 +0200 Subject: [PATCH 1/4] Use a better robots.txt to prevent information leakage --- src/static/robots.txt | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/src/static/robots.txt b/src/static/robots.txt index c2a49f4..ac2ea8f 100644 --- a/src/static/robots.txt +++ b/src/static/robots.txt @@ -1,2 +1,29 @@ User-agent: * -Allow: / +Allow: /$ +Disallow: / + +# Following borrowed from https://seirdy.one/robots.txt + +# I opt out of online advertising so malware that injects ads on my site won't get paid. +# You should do the same. +User-Agent: Adsbot +Disallow: / + +# > This robot collects content from the Internet for the sole purpose of helping educational institutions prevent plagiarism. [...] we compare student papers against the content we find on the Internet to see if we can find similarities. (http://www.turnitin.com/robot/crawlerinfo.html) +User-Agent: TurnitinBot +Disallow: / + +# > NameProtect engages in crawling activity in search of a wide range of brand and other intellectual property violations that may be of interest to our clients. (http://www.nameprotect.com/botinfo.html) +User-Agent: NPBot +Disallow: / + +# iThenticate is a new service we have developed to combat the piracy of intellectual property and ensure the originality of written work for publishers, non-profit agencies, corporations, and newspapers. (http://www.slysearch.com/) +User-Agent: SlySearch +Disallow: / + +# BLEXBot assists internet marketers to get information on the link structure of sites and their interlinking on the web, to avoid any technical and possible legal issues and improve overall online experience. 
(http://webmeup-crawler.com/) +User-Agent: BLEXBot +Disallow: / + +User-agent: ChatGPT-User +Disallow: / From c23921d0ff34ddf9af7e63d91681fd8f70c1e889 Mon Sep 17 00:00:00 2001 From: Nikita Karamov Date: Sat, 17 Jun 2023 14:07:34 +0200 Subject: [PATCH 2/4] Update pre-commit --- .pre-commit-config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1c66f9c..6166ee4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,17 +11,17 @@ repos: - id: check-toml - id: check-yaml - repo: https://github.com/pre-commit/mirrors-prettier - rev: "v2.7.1" + rev: "v3.0.0-alpha.9-for-vscode" hooks: - id: prettier additional_dependencies: - prettier@2 - repo: https://github.com/pre-commit/mirrors-eslint - rev: "v8.35.0" + rev: "v8.43.0" hooks: - id: eslint additional_dependencies: - eslint - eslint-config-prettier - eslint-plugin-unicorn - - prettier + - prettier@2 From a040b32405c732c1dc6d7da9bca59cec5d1885d0 Mon Sep 17 00:00:00 2001 From: Nikita Karamov Date: Sat, 17 Jun 2023 14:08:15 +0200 Subject: [PATCH 3/4] Use at() instead of charAt() --- src/main.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.js b/src/main.js index 0d24963..c4cbdd1 100644 --- a/src/main.js +++ b/src/main.js @@ -43,7 +43,7 @@ function normalizeUrl(url) { if (!url.includes("http://") && !url.includes("https://")) { url = "https://" + url; } - if (url.charAt(url.length - 1) !== "/") { + if (url.at(-1) !== "/") { url = url + "/"; } return url; From a41fe8ba9ebd29700fb49586471fad6e80b870db Mon Sep 17 00:00:00 2001 From: Nikita Karamov Date: Sat, 17 Jun 2023 14:10:33 +0200 Subject: [PATCH 4/4] Update changelog --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 010975a..756d3ef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file. 
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Fixed + +- blocked crawling of all pages other than index (#33) + ## [2.4.4] - 2023-03-01 ### Fixed