// Copyright 2015 Matthew Holt and The Caddy Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package caddyfile
import (
"bufio"
2020-07-20 14:55:51 -05:00
"bytes"
2023-02-25 19:34:27 -05:00
"fmt"
2019-08-09 13:05:47 -05:00
"io"
2023-02-25 19:34:27 -05:00
"regexp"
"strings"
2019-08-09 13:05:47 -05:00
"unicode"
)
// lexer pulls tokens, one at a time, out of a stream of runes.
// Tokens are words separated by whitespace; a word that needs to
// contain whitespace can be wrapped in quotes.
type lexer struct {
	reader       *bufio.Reader // buffered input the runes are read from
	token        Token         // token currently being (or most recently) assembled
	line         int           // line counter; load starts it at 1
	skippedLines int           // line breaks consumed but not yet added to line
}

// Token represents a single parsable unit.
type Token struct {
	File          string   // source name; Tokenize sets it to the filename it was given
	imports       []string // import chain of the token; compared by isNextOnNewLine, not populated in this file
	Line          int      // value of the lexer's line counter when the token started
	Text          string   // token text, without enclosing quotes or heredoc markers
	wasQuoted     rune     // enclosing quote character, if any ('<' for heredocs)
	heredocMarker string   // heredoc marker, when the token came from a heredoc
	snippetName   string   // not used in this file; presumably the enclosing snippet's name (confirm against callers)
}
2023-02-25 19:34:27 -05:00
// Tokenize takes bytes as input and lexes it into
// a list of tokens that can be parsed as a Caddyfile.
// Also takes a filename to fill the token's File as
// the source of the tokens, which is important to
// determine relative paths for `import` directives.
func Tokenize ( input [ ] byte , filename string ) ( [ ] Token , error ) {
l := lexer { }
if err := l . load ( bytes . NewReader ( input ) ) ; err != nil {
return nil , err
2023-02-16 19:08:36 -05:00
}
2023-02-25 19:34:27 -05:00
var tokens [ ] Token
for {
found , err := l . next ( )
if err != nil {
return nil , err
}
if ! found {
break
}
l . token . File = filename
tokens = append ( tokens , l . token )
2023-02-16 19:08:36 -05:00
}
2023-02-25 19:34:27 -05:00
return tokens , nil
2023-02-16 19:08:36 -05:00
}
2019-08-09 13:05:47 -05:00
// load prepares the lexer to scan input for tokens: it wraps input in
// a buffered reader, resets the line counter to 1, and discards a
// leading byte order mark (U+FEFF) if one is present.
//
// Empty input is not an error. Input consisting only of whitespace
// already lexes to zero tokens, so a completely empty input is treated
// the same way instead of surfacing a bare io.EOF to the caller.
// Any other read error is returned unchanged.
func (l *lexer) load(input io.Reader) error {
	l.reader = bufio.NewReader(input)
	l.line = 1

	// peek at the first rune to see whether it is a byte order mark
	firstCh, _, err := l.reader.ReadRune()
	if err == io.EOF {
		// nothing to read at all; next will report "no more tokens"
		return nil
	}
	if err != nil {
		return err
	}
	if firstCh == 0xFEFF {
		// byte order mark: leave it consumed
		return nil
	}

	// an ordinary rune: put it back so next sees it
	return l.reader.UnreadRune()
}
// next loads the next token into the lexer.
// A token is delimited by whitespace, unless
// the token starts with a quotes character (")
// in which case the token goes until the closing
// quotes (the enclosing quotes are not included).
// Inside quoted strings, quotes may be escaped
// with a preceding \ character. No other chars
// may be escaped. The rest of the line is skipped
// if a "#" character is read in. Returns true if
// a token was loaded; false otherwise.
2023-02-25 19:34:27 -05:00
func ( l * lexer ) next ( ) ( bool , error ) {
2019-08-09 13:05:47 -05:00
var val [ ] rune
2023-02-25 19:34:27 -05:00
var comment , quoted , btQuoted , inHeredoc , heredocEscaped , escaped bool
var heredocMarker string
2019-08-09 13:05:47 -05:00
2022-03-18 16:08:23 -05:00
makeToken := func ( quoted rune ) bool {
2019-08-09 13:05:47 -05:00
l . token . Text = string ( val )
2022-03-18 16:08:23 -05:00
l . token . wasQuoted = quoted
2023-02-25 19:34:27 -05:00
l . token . heredocMarker = heredocMarker
2019-08-09 13:05:47 -05:00
return true
}
for {
2023-02-25 19:34:27 -05:00
// Read a character in; if err then if we had
// read some characters, make a token. If we
// reached EOF, then no more tokens to read.
// If no EOF, then we had a problem.
2019-08-09 13:05:47 -05:00
ch , _ , err := l . reader . ReadRune ( )
if err != nil {
if len ( val ) > 0 {
2023-02-25 19:34:27 -05:00
if inHeredoc {
return false , fmt . Errorf ( "incomplete heredoc <<%s on line #%d, expected ending marker %s" , heredocMarker , l . line + l . skippedLines , heredocMarker )
}
return makeToken ( 0 ) , nil
2019-08-09 13:05:47 -05:00
}
if err == io . EOF {
2023-02-25 19:34:27 -05:00
return false , nil
2019-08-09 13:05:47 -05:00
}
2023-02-25 19:34:27 -05:00
return false , err
2019-08-09 13:05:47 -05:00
}
2023-02-25 19:34:27 -05:00
// detect whether we have the start of a heredoc
2023-08-19 05:32:32 -05:00
if ! ( quoted || btQuoted ) && ! ( inHeredoc || heredocEscaped ) &&
len ( val ) > 1 && string ( val [ : 2 ] ) == "<<" {
// a space means it's just a regular token and not a heredoc
if ch == ' ' {
return makeToken ( 0 ) , nil
2023-02-25 19:34:27 -05:00
}
2023-08-19 05:32:32 -05:00
// skip CR, we only care about LF
2023-02-25 19:34:27 -05:00
if ch == '\r' {
continue
}
2023-08-19 05:32:32 -05:00
2023-02-25 19:34:27 -05:00
// after hitting a newline, we know that the heredoc marker
// is the characters after the two << and the newline.
// we reset the val because the heredoc is syntax we don't
// want to keep.
if ch == '\n' {
2023-08-23 22:27:57 -05:00
if len ( val ) == 2 {
return false , fmt . Errorf ( "missing opening heredoc marker on line #%d; must contain only alpha-numeric characters, dashes and underscores; got empty string" , l . line )
}
2023-08-19 05:32:32 -05:00
// check if there's too many <
if string ( val [ : 3 ] ) == "<<<" {
return false , fmt . Errorf ( "too many '<' for heredoc on line #%d; only use two, for example <<END" , l . line )
}
2023-02-25 19:34:27 -05:00
heredocMarker = string ( val [ 2 : ] )
if ! heredocMarkerRegexp . Match ( [ ] byte ( heredocMarker ) ) {
return false , fmt . Errorf ( "heredoc marker on line #%d must contain only alpha-numeric characters, dashes and underscores; got '%s'" , l . line , heredocMarker )
}
inHeredoc = true
l . skippedLines ++
val = nil
continue
}
val = append ( val , ch )
continue
}
// if we're in a heredoc, all characters are read as-is
if inHeredoc {
val = append ( val , ch )
if ch == '\n' {
l . skippedLines ++
}
// check if we're done, i.e. that the last few characters are the marker
2024-01-25 09:55:00 -05:00
if len ( val ) >= len ( heredocMarker ) && heredocMarker == string ( val [ len ( val ) - len ( heredocMarker ) : ] ) {
2023-02-25 19:34:27 -05:00
// set the final value
val , err = l . finalizeHeredoc ( val , heredocMarker )
if err != nil {
return false , err
}
// set the line counter, and make the token
l . line += l . skippedLines
l . skippedLines = 0
return makeToken ( '<' ) , nil
}
// stay in the heredoc until we find the ending marker
continue
}
// track whether we found an escape '\' for the next
// iteration to be contextually aware
2020-05-05 13:27:49 -05:00
if ! escaped && ! btQuoted && ch == '\\' {
2019-09-28 22:18:36 -05:00
escaped = true
continue
}
2020-05-05 13:27:49 -05:00
if quoted || btQuoted {
if quoted && escaped {
2019-09-28 22:18:36 -05:00
// all is literal in quoted area,
// so only escape quotes
if ch != '"' {
val = append ( val , '\\' )
}
escaped = false
} else {
2023-02-25 19:34:27 -05:00
if ( quoted && ch == '"' ) || ( btQuoted && ch == '`' ) {
return makeToken ( ch ) , nil
2019-08-09 13:05:47 -05:00
}
}
2023-02-25 19:34:27 -05:00
// allow quoted text to wrap continue on multiple lines
2019-08-09 13:05:47 -05:00
if ch == '\n' {
2019-09-28 22:18:36 -05:00
l . line += 1 + l . skippedLines
l . skippedLines = 0
2019-08-09 13:05:47 -05:00
}
2023-02-25 19:34:27 -05:00
// collect this character as part of the quoted token
2019-08-09 13:05:47 -05:00
val = append ( val , ch )
continue
}
if unicode . IsSpace ( ch ) {
2023-02-25 19:34:27 -05:00
// ignore CR altogether, we only actually care about LF (\n)
2019-08-09 13:05:47 -05:00
if ch == '\r' {
continue
}
2023-02-25 19:34:27 -05:00
// end of the line
2019-08-09 13:05:47 -05:00
if ch == '\n' {
2023-02-25 19:34:27 -05:00
// newlines can be escaped to chain arguments
// onto multiple lines; else, increment the line count
2019-09-28 22:18:36 -05:00
if escaped {
l . skippedLines ++
escaped = false
} else {
l . line += 1 + l . skippedLines
l . skippedLines = 0
}
2023-02-25 19:34:27 -05:00
// comments (#) are single-line only
2019-08-09 13:05:47 -05:00
comment = false
}
2023-02-25 19:34:27 -05:00
// any kind of space means we're at the end of this token
2019-08-09 13:05:47 -05:00
if len ( val ) > 0 {
2023-02-25 19:34:27 -05:00
return makeToken ( 0 ) , nil
2019-08-09 13:05:47 -05:00
}
continue
}
2023-02-25 19:34:27 -05:00
// comments must be at the start of a token,
// in other words, preceded by space or newline
2020-05-05 13:32:12 -05:00
if ch == '#' && len ( val ) == 0 {
2019-08-09 13:05:47 -05:00
comment = true
}
if comment {
continue
}
if len ( val ) == 0 {
l . token = Token { Line : l . line }
if ch == '"' {
quoted = true
continue
}
2020-05-05 13:27:49 -05:00
if ch == '`' {
btQuoted = true
continue
}
2019-08-09 13:05:47 -05:00
}
2019-10-15 17:05:53 -05:00
if escaped {
2023-02-25 19:34:27 -05:00
// allow escaping the first < to skip the heredoc syntax
if ch == '<' {
heredocEscaped = true
} else {
val = append ( val , '\\' )
}
2019-10-15 17:05:53 -05:00
escaped = false
}
2019-08-09 13:05:47 -05:00
val = append ( val , ch )
}
}
2020-07-20 14:55:51 -05:00
2023-02-25 19:34:27 -05:00
// finalizeHeredoc takes the runes read as the heredoc text and the marker,
// and processes the text to strip leading whitespace, returning the final
// value without the leading whitespace.
func ( l * lexer ) finalizeHeredoc ( val [ ] rune , marker string ) ( [ ] rune , error ) {
2023-02-26 16:56:48 -05:00
stringVal := string ( val )
2023-02-25 19:34:27 -05:00
// find the last newline of the heredoc, which is where the contents end
2023-02-26 16:56:48 -05:00
lastNewline := strings . LastIndex ( stringVal , "\n" )
2023-02-25 19:34:27 -05:00
// collapse the content, then split into separate lines
2023-02-26 16:56:48 -05:00
lines := strings . Split ( stringVal [ : lastNewline + 1 ] , "\n" )
2023-02-25 19:34:27 -05:00
// figure out how much whitespace we need to strip from the front of every line
// by getting the string that precedes the marker, on the last line
2023-02-26 16:56:48 -05:00
paddingToStrip := stringVal [ lastNewline + 1 : len ( stringVal ) - len ( marker ) ]
2023-02-25 19:34:27 -05:00
// iterate over each line and strip the whitespace from the front
var out string
for lineNum , lineText := range lines [ : len ( lines ) - 1 ] {
2024-01-21 21:24:49 -05:00
if lineText == "" || lineText == "\r" {
2024-01-18 22:57:18 -05:00
out += "\n"
continue
}
2023-02-25 19:34:27 -05:00
// find an exact match for the padding
index := strings . Index ( lineText , paddingToStrip )
// if the padding doesn't match exactly at the start then we can't safely strip
if index != 0 {
return nil , fmt . Errorf ( "mismatched leading whitespace in heredoc <<%s on line #%d [%s], expected whitespace [%s] to match the closing marker" , marker , l . line + lineNum + 1 , lineText , paddingToStrip )
}
// strip, then append the line, with the newline, to the output.
// also removes all "\r" because Windows.
out += strings . ReplaceAll ( lineText [ len ( paddingToStrip ) : ] + "\n" , "\r" , "" )
2020-07-20 14:55:51 -05:00
}
2023-02-25 19:34:27 -05:00
2023-02-26 16:56:48 -05:00
// Remove the trailing newline from the loop
if len ( out ) > 0 && out [ len ( out ) - 1 ] == '\n' {
out = out [ : len ( out ) - 1 ]
}
2023-02-25 19:34:27 -05:00
// return the final value
return [ ] rune ( out ) , nil
}
2022-09-01 22:12:37 -05:00
// Quoted reports whether the token was enclosed in some form of
// quoting when it was lexed, i.e. whether an enclosing quote
// character was recorded for it.
func (t Token) Quoted() bool {
	const unquoted rune = 0
	return t.wasQuoted > unquoted
}
2023-02-25 19:34:27 -05:00
// NumLineBreaks counts how many line breaks are in the token text.
func ( t Token ) NumLineBreaks ( ) int {
lineBreaks := strings . Count ( t . Text , "\n" )
if t . wasQuoted == '<' {
// heredocs have an extra linebreak because the opening
2023-02-26 16:56:48 -05:00
// delimiter is on its own line and is not included in the
// token Text itself, and the trailing newline is removed.
lineBreaks += 2
2023-02-25 19:34:27 -05:00
}
return lineBreaks
}
// heredocMarkerRegexp matches a valid heredoc marker: one or more
// ASCII letters, digits, underscores or dashes, and nothing else.
var heredocMarkerRegexp = regexp.MustCompile(`^[A-Za-z0-9_-]+$`)
2023-07-12 15:32:22 -05:00
// isNextOnNewLine reports whether t2 is on a different line from t1.
func isNextOnNewLine(t1, t2 Token) bool {
	// Tokens that come from different files, or from import chains of
	// different depth, are assumed to be on different lines.
	if t1.File != t2.File || len(t1.imports) != len(t2.imports) {
		return true
	}

	// same depth: any differing entry means a different import chain
	for i := range t1.imports {
		if t1.imports[i] != t2.imports[i] {
			return true
		}
	}

	// Same origin: t2 is on a new line only if it starts after the
	// line on which t1 (including the line breaks it spans) ends.
	lastLineOfT1 := t1.Line + t1.NumLineBreaks()
	return lastLineOfT1 < t2.Line
}