// Copyright 2022 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package gitdiff
import (
"bytes"
"html/template"
"strings"
"github.com/sergi/go-diff/diffmatchpatch"
)
// extractDiffTokenRemainingFullTag tries to extract full tag with content from the remaining string
// e.g. for input: "contentthe-rest...", it returns "content", "the-rest...", true
func extractDiffTokenRemainingFullTag(s string) (token, after string, valid bool) {
pos := 0
for ; pos < len(s); pos++ {
c := s[pos]
if c == '<' {
break
}
// keep in mind: even if we'd like to relax this check,
// we should never ignore "&" because it is for HTML entity and can't be safely used in the diff algorithm,
// because diff between "<" and ">" will generate broken result.
isSymbolChar := 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' || c == '_' || c == '-'
if !isSymbolChar {
return "", s, false
}
}
if pos+1 >= len(s) || s[pos+1] != '/' {
return "", s, false
}
pos2 := strings.IndexByte(s[pos:], '>')
if pos2 == -1 {
return "", s, false
}
return s[:pos+pos2+1], s[pos+pos2+1:], true
}
// Returned token:
// * full tag with content: "<content>", it is used to optimize diff results to highlight the whole changed symbol
// * opening/close tag: "" or ""
// * HTML entity: "<"
func extractDiffToken(s string) (before, token, after string, valid bool) {
for pos1 := 0; pos1 < len(s); pos1++ {
switch s[pos1] {
case '<':
pos2 := strings.IndexByte(s[pos1:], '>')
if pos2 == -1 {
return "", "", s, false
}
before, token, after = s[:pos1], s[pos1:pos1+pos2+1], s[pos1+pos2+1:]
if !strings.HasPrefix(token, "") {
// try to extract full tag with content, e.g. `<content>`, to optimize diff results
if fullTokenRemaining, fullTokenAfter, ok := extractDiffTokenRemainingFullTag(after); ok {
return before, "<" + token + fullTokenRemaining + ">", fullTokenAfter, true
}
}
return before, token, after, true
case '&':
pos2 := strings.IndexByte(s[pos1:], ';')
if pos2 == -1 {
return "", "", s, false
}
return s[:pos1], s[pos1 : pos1+pos2+1], s[pos1+pos2+1:], true
}
}
return "", "", s, true
}
// highlightCodeDiff is used to do diff with highlighted HTML code.
// It totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes.
// The HTML tags and entities will be replaced by Unicode placeholders: "{TEXT}" => "\uE000{TEXT}\uE001"
// These Unicode placeholders are friendly to the diff.
// Then after diff, the placeholders in diff result will be recovered to the HTML tags and entities.
// It's guaranteed that the tags in final diff result are paired correctly.
type highlightCodeDiff struct {
placeholderBegin rune
placeholderMaxCount int
placeholderIndex int
placeholderTokenMap map[rune]string
tokenPlaceholderMap map[string]rune
placeholderOverflowCount int
}
func newHighlightCodeDiff() *highlightCodeDiff {
return &highlightCodeDiff{
placeholderBegin: rune(0x100000), // Plane 16: Supplementary Private Use Area B (U+100000..U+10FFFD)
placeholderMaxCount: 64000,
placeholderTokenMap: map[rune]string{},
tokenPlaceholderMap: map[string]rune{},
}
}
// nextPlaceholder returns 0 if no more placeholder can be used
// the diff is done line by line, usually there are only a few (no more than 10) placeholders in one line
// so the placeholderMaxCount is impossible to be exhausted in real cases.
func (hcd *highlightCodeDiff) nextPlaceholder() rune {
for hcd.placeholderIndex < hcd.placeholderMaxCount {
r := hcd.placeholderBegin + rune(hcd.placeholderIndex)
hcd.placeholderIndex++
// only use non-existing (not used by code) rune as placeholders
if _, ok := hcd.placeholderTokenMap[r]; !ok {
return r
}
}
return 0 // no more available placeholder
}
func (hcd *highlightCodeDiff) isInPlaceholderRange(r rune) bool {
return hcd.placeholderBegin <= r && r < hcd.placeholderBegin+rune(hcd.placeholderMaxCount)
}
func (hcd *highlightCodeDiff) collectUsedRunes(code template.HTML) {
for _, r := range code {
if hcd.isInPlaceholderRange(r) {
// put the existing rune (used by code) in map, then this rune won't be used a placeholder anymore.
hcd.placeholderTokenMap[r] = ""
}
}
}
func (hcd *highlightCodeDiff) diffLineWithHighlight(lineType DiffLineType, codeA, codeB template.HTML) template.HTML {
hcd.collectUsedRunes(codeA)
hcd.collectUsedRunes(codeB)
convertedCodeA := hcd.convertToPlaceholders(codeA)
convertedCodeB := hcd.convertToPlaceholders(codeB)
dmp := defaultDiffMatchPatch()
diffs := dmp.DiffMain(convertedCodeA, convertedCodeB, true)
diffs = dmp.DiffCleanupSemantic(diffs)
buf := bytes.NewBuffer(nil)
addedCodePrefix := hcd.registerTokenAsPlaceholder(``)
addedCodeSuffix := hcd.registerTokenAsPlaceholder(``)
removedCodePrefix := hcd.registerTokenAsPlaceholder(``)
removedCodeSuffix := hcd.registerTokenAsPlaceholder(``)
if removedCodeSuffix != 0 {
for _, diff := range diffs {
switch {
case diff.Type == diffmatchpatch.DiffEqual:
buf.WriteString(diff.Text)
case diff.Type == diffmatchpatch.DiffInsert && lineType == DiffLineAdd:
buf.WriteRune(addedCodePrefix)
buf.WriteString(diff.Text)
buf.WriteRune(addedCodeSuffix)
case diff.Type == diffmatchpatch.DiffDelete && lineType == DiffLineDel:
buf.WriteRune(removedCodePrefix)
buf.WriteString(diff.Text)
buf.WriteRune(removedCodeSuffix)
}
}
} else {
// placeholder map space is exhausted
for _, diff := range diffs {
take := diff.Type == diffmatchpatch.DiffEqual || (diff.Type == diffmatchpatch.DiffInsert && lineType == DiffLineAdd) || (diff.Type == diffmatchpatch.DiffDelete && lineType == DiffLineDel)
if take {
buf.WriteString(diff.Text)
}
}
}
return hcd.recoverOneDiff(buf.String())
}
func (hcd *highlightCodeDiff) registerTokenAsPlaceholder(token string) rune {
placeholder, ok := hcd.tokenPlaceholderMap[token]
if !ok {
placeholder = hcd.nextPlaceholder()
if placeholder != 0 {
hcd.tokenPlaceholderMap[token] = placeholder
hcd.placeholderTokenMap[placeholder] = token
}
}
return placeholder
}
// convertToPlaceholders totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes.
func (hcd *highlightCodeDiff) convertToPlaceholders(htmlContent template.HTML) string {
var tagStack []string
res := strings.Builder{}
htmlCode := strings.TrimSpace(string(htmlContent))
// the standard chroma highlight HTML is ` ... `
// the line wrapper tags should be removed before diff
if strings.HasPrefix(htmlCode, `")
}
var beforeToken, token string
var valid bool
for {
beforeToken, token, htmlCode, valid = extractDiffToken(htmlCode)
if !valid || token == "" {
break
}
// write the content before the token into result string, and consume the token in the string
res.WriteString(beforeToken)
var tokenInMap string
if strings.HasPrefix(token, "") { // for closing tag
if len(tagStack) == 0 {
break // invalid diff result, no opening tag but see closing tag
}
// make sure the closing tag in map is related to the open tag, to make the diff algorithm can match the opening/closing tags
// the closing tag will be recorded in the map by key "" for ""
tokenInMap = token + ""
tagStack = tagStack[:len(tagStack)-1]
} else if token[0] == '<' {
if token[1] == '<' {
// full tag `<content>`, recover to `content`
tokenInMap = token
} else {
// opening tag
tokenInMap = token
tagStack = append(tagStack, token)
}
} else if token[0] == '&' { // for HTML entity
tokenInMap = token
} // else: impossible
// remember the placeholder and token in the map
placeholder := hcd.registerTokenAsPlaceholder(tokenInMap)
if placeholder != 0 {
res.WriteRune(placeholder) // use the placeholder to replace the token
} else {
// unfortunately, all private use runes has been exhausted, no more placeholder could be used, no more converting
// usually, the exhausting won't occur in real cases, the magnitude of used placeholders is not larger than that of the CSS classes outputted by chroma.
hcd.placeholderOverflowCount++
if strings.HasPrefix(token, "<<") {
pos1 := strings.IndexByte(token, '>')
pos2 := strings.LastIndexByte(token, '<')
res.WriteString(token[pos1+1 : pos2]) // recover to `content` from "<content>"
}
if strings.HasPrefix(token, "&") {
// when the token is an HTML entity, something must be outputted even if there is no placeholder.
res.WriteRune(0xFFFD) // replacement character TODO: how to handle this case more gracefully?
res.WriteString(token[1:]) // still output the entity code part, otherwise there will be no diff result.
}
}
}
// write the remaining string
res.WriteString(htmlCode)
return res.String()
}
func (hcd *highlightCodeDiff) recoverOneDiff(str string) template.HTML {
sb := strings.Builder{}
var tagStack []string
for _, r := range str {
token, ok := hcd.placeholderTokenMap[r]
if !ok || token == "" {
sb.WriteRune(r) // if the rune is not a placeholder, write it as it is
continue
}
var tokenToRecover string
if strings.HasPrefix(token, "") { // for closing tag
// only get the tag itself, ignore the trailing comment (for how the comment is generated, see the code in `convert` function)
tokenToRecover = token[:strings.IndexByte(token, '>')+1]
if len(tagStack) == 0 {
continue // if no opening tag in stack yet, skip the closing tag
}
tagStack = tagStack[:len(tagStack)-1]
} else if token[0] == '<' { // for HTML tag
if token[1] == '<' {
// full tag `<content>`, recover to `content`
tokenToRecover = token[1 : len(token)-1]
} else {
// opening tag
tokenToRecover = token
tagStack = append(tagStack, token)
}
} else if token[0] == '&' { // for HTML entity
tokenToRecover = token
} // else: impossible
sb.WriteString(tokenToRecover)
}
if len(tagStack) > 0 {
// close all opening tags
for i := len(tagStack) - 1; i >= 0; i-- {
tagToClose := tagStack[i]
// get the closing tag "" from "" or ""
pos := strings.IndexAny(tagToClose, " >")
if pos != -1 {
sb.WriteString("" + tagToClose[1:pos] + ">")
} // else: impossible. every tag was pushed into the stack by the code above and is valid HTML opening tag
}
}
return template.HTML(sb.String())
}