mirror of
https://codeberg.org/forgejo/forgejo.git
synced 2024-12-22 15:23:14 -05:00
5b3a82d621
- The ambiguous character detection is an important security feature to combat against sourcebase attacks (https://trojansource.codes/). - However, there are a few problems with the feature as it stands today: (i) it's apparently a big performance hitter, being twice as slow as syntax highlighting; (ii) it contains false positives, because it reports problems that are valid in themselves but that, within the context of a programming language, cannot lead to security issues (ambiguous characters in code comments being a prime example); (iii) characters from certain languages are always marked as ambiguous. It's a lot of effort to fix the aforementioned issues. - Therefore, make it configurable in which contexts the ambiguous character detection should be run. This avoids running detection in all contexts, such as file views, while still enabling it in commit and pull request diffs, where it matters the most. Ideally this would also become a per-repository setting, but the code architecture doesn't allow for a clean implementation of that. - Adds a unit test. - Adds integration tests to ensure that the contexts and the instance-wide setting are respected (and that ambiguous character detection actually works in different places). - Ref: https://codeberg.org/forgejo/forgejo/pulls/2395#issuecomment-1575547 - Ref: https://codeberg.org/forgejo/forgejo/issues/564
58 lines
2.1 KiB
Go
58 lines
2.1 KiB
Go
// Copyright 2022 The Gitea Authors. All rights reserved.
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
//go:generate go run invisible/generate.go -v -o ./invisible_gen.go
|
|
|
|
//go:generate go run ambiguous/generate.go -v -o ./ambiguous_gen.go ambiguous/ambiguous.json
|
|
|
|
package charset
|
|
|
|
import (
|
|
"html/template"
|
|
"io"
|
|
"slices"
|
|
"strings"
|
|
|
|
"code.gitea.io/gitea/modules/log"
|
|
"code.gitea.io/gitea/modules/setting"
|
|
"code.gitea.io/gitea/modules/translation"
|
|
)
|
|
|
|
// RuneNBSP is the code point of the no-break space (NBSP), U+00A0.
|
|
const RuneNBSP = 0xa0
|
|
|
|
// escapeContext names a place in which content may be escaped. Its string
// value is what EscapeControlReader looks up in setting.UI.SkipEscapeContexts.
type escapeContext string
|
|
|
|
// Keep this consistent with the documentation of [ui].SKIP_ESCAPE_CONTEXTS.
|
|
// Defines the different contexts in which escaping can take place.
|
|
const (
|
|
// WikiContext covers wiki pages.
|
|
WikiContext escapeContext = "wiki"
|
|
// FileviewContext covers rendered content (except markup), source code and blames.
|
|
FileviewContext escapeContext = "file-view"
|
|
// DiffContext covers the diff of a commit or of a pull request.
|
|
DiffContext escapeContext = "diff"
|
|
)
|
|
|
|
// EscapeControlHTML escapes the unicode control sequences in a provided html document
|
|
func EscapeControlHTML(html template.HTML, locale translation.Locale, context escapeContext, allowed ...rune) (escaped *EscapeStatus, output template.HTML) {
|
|
sb := &strings.Builder{}
|
|
escaped, _ = EscapeControlReader(strings.NewReader(string(html)), sb, locale, context, allowed...) // err has been handled in EscapeControlReader
|
|
return escaped, template.HTML(sb.String())
|
|
}
|
|
|
|
// EscapeControlReader escapes the unicode control sequences in a provided reader of HTML content and writer in a locale and returns the findings as an EscapeStatus
|
|
func EscapeControlReader(reader io.Reader, writer io.Writer, locale translation.Locale, context escapeContext, allowed ...rune) (escaped *EscapeStatus, err error) {
|
|
if !setting.UI.AmbiguousUnicodeDetection || slices.Contains(setting.UI.SkipEscapeContexts, string(context)) {
|
|
_, err = io.Copy(writer, reader)
|
|
return &EscapeStatus{}, err
|
|
}
|
|
outputStream := &HTMLStreamerWriter{Writer: writer}
|
|
streamer := NewEscapeStreamer(locale, outputStream, allowed...).(*escapeStreamer)
|
|
|
|
if err = StreamHTML(reader, streamer); err != nil {
|
|
streamer.escaped.HasError = true
|
|
log.Error("Error whilst escaping: %v", err)
|
|
}
|
|
return streamer.escaped, err
|
|
}
|