From 8ead10c82e88b8f598fcb9b71cfae6fbe7978260 Mon Sep 17 00:00:00 2001 From: Moritz Marquardt Date: Fri, 19 Mar 2021 20:30:08 +0100 Subject: [PATCH] Implement SEO optimizations and improve error handling and branch detection --- handler.go | 162 ++++++++++++++++++++++++++++++++++++----------------- main.go | 20 ------- 2 files changed, 111 insertions(+), 71 deletions(-) diff --git a/handler.go b/handler.go index be78499..adf5c25 100644 --- a/handler.go +++ b/handler.go @@ -63,16 +63,58 @@ func handler(ctx *fasthttp.RequestCtx) { TryIndexPages: true, } + // tryBranch checks if a branch exists and populates the target variables. If canonicalLink is non-empty, it will + // also disallow search indexing and add a Link header to the canonical URL. + var tryBranch = func(repo string, branch string, path []string, canonicalLink string) bool { + if repo == "" { + return false + } + fmt.Printf("Trying branch: %s/%s/%s with path %v\n", targetOwner, repo, branch, path) + + escapedBranch, _ := url.PathUnescape(branch) + if escapedBranch == "" { + escapedBranch = branch + } + // Check if the branch exists, otherwise treat it as a file path + targetBranch, targetOptions.BranchTimestamp = getBranchTimestamp(targetOwner, repo, branch) + fmt.Printf("Branch %s has timestamp %v\n", targetBranch, targetOptions.BranchTimestamp) + if targetOptions.BranchTimestamp != (time.Time{}) { + // Branch exists, use it + targetRepo = repo + targetPath = strings.Trim(strings.Join(path, "/"), "/") + + if canonicalLink != "" { + // Hide from search machines & add canonical link + ctx.Response.Header.Set("X-Robots-Tag", "noarchive, noindex") + ctx.Response.Header.Set("Link", + strings.NewReplacer("%b", targetBranch, "%p", targetPath).Replace(canonicalLink)+ + "; rel=\"canonical\"", + ) + } + + return true + } else { + // branch doesn't exist + return false + } + } + + // tryUpstream forwards the target request to the Gitea API, and shows an error page on failure. + var tryUpstream = func() { + // Try to request the file from the Gitea API + if !upstream(ctx, targetOwner, targetRepo, targetBranch, targetPath, targetOptions) { + returnErrorPage(ctx, ctx.Response.StatusCode()) + } + } + if RawDomain != nil && bytes.Equal(ctx.Request.Host(), RawDomain) { // Serve raw content from RawDomain - // TODO: add canonical link and "X-Robots-Tag: noarchive, noindex" - targetOptions.TryIndexPages = false targetOptions.ForbiddenMimeTypes["text/html"] = struct{}{} targetOptions.DefaultMimeType = "text/plain; charset=utf-8" - pathElements := strings.SplitN(string(bytes.Trim(ctx.Request.URI().Path(), "/")), "/", 4) + pathElements := strings.Split(string(bytes.Trim(ctx.Request.URI().Path(), "/")), "/") if len(pathElements) < 2 { // https://{RawDomain}/{owner}/{repo}[/@{branch}]/{path} is required ctx.Redirect(RawInfoPage, fasthttp.StatusTemporaryRedirect) @@ -80,46 +122,74 @@ func handler(ctx *fasthttp.RequestCtx) { } targetOwner = pathElements[0] targetRepo = pathElements[1] - if len(pathElements) > 3 { - targetPath = strings.Trim(pathElements[2]+"/"+pathElements[3], "/") - } else if len(pathElements) > 2 { - targetPath = pathElements[2] - } // raw.codeberg.page/example/myrepo/@main/index.html - if len(pathElements) > 3 && strings.HasPrefix(pathElements[2], "@") { - branch, _ := url.PathUnescape(pathElements[2][1:]) - if branch == "" { - branch = pathElements[2][1:] - } - // Check if the branch exists, otherwise treat it as a file path - targetBranch, targetOptions.BranchTimestamp = getBranchTimestamp(targetOwner, targetRepo, branch) - if targetOptions.BranchTimestamp != (time.Time{}) { - targetPath = strings.Trim(pathElements[3], "/") // branch exists, use it - } else { - targetBranch = "" // branch doesn't exist, use default branch + if len(pathElements) > 2 && strings.HasPrefix(pathElements[2], "@") { + if tryBranch(targetRepo, pathElements[2][1:], pathElements[3:], + string(GiteaRoot)+"/"+targetOwner+"/"+targetRepo+"/src/branch/%b/%p", + ) { + tryUpstream() + return } + returnErrorPage(ctx, fasthttp.StatusFailedDependency) + return + } else { + tryBranch(targetRepo, "", pathElements[2:], + string(GiteaRoot)+"/"+targetOwner+"/"+targetRepo+"/src/branch/%b/%p", + ) + tryUpstream() + return } } else if bytes.HasSuffix(ctx.Request.Host(), MainDomainSuffix) { // Serve pages from subdomains of MainDomainSuffix - // TODO: add @branch syntax with "X-Robots-Tag: noarchive, noindex" - - pathElements := strings.SplitN(string(bytes.Trim(ctx.Request.URI().Path(), "/")), "/", 2) + pathElements := strings.Split(string(bytes.Trim(ctx.Request.URI().Path(), "/")), "/") targetOwner = string(bytes.TrimSuffix(ctx.Request.Host(), MainDomainSuffix)) targetRepo = pathElements[0] - if len(pathElements) > 1 { - targetPath = strings.Trim(pathElements[1], "/") + targetPath = strings.Trim(strings.Join(pathElements[1:], "/"), "/") + + // Check if the first directory is a repo with the second directory as a branch + // example.codeberg.page/myrepo/@main/index.html + if len(pathElements) > 1 && strings.HasPrefix(pathElements[1], "@") { + if tryBranch(pathElements[0], pathElements[1][1:], pathElements[2:], + "/"+pathElements[0]+"/%p", + ) { + tryUpstream() + } else { + returnErrorPage(ctx, fasthttp.StatusFailedDependency) + } + return + } + + // Check if the first directory is a branch for the "pages" repo + // example.codeberg.page/@main/index.html + if strings.HasPrefix(pathElements[0], "@") { + if tryBranch("pages", pathElements[0][1:], pathElements[1:], "/%p") { + tryUpstream() + } else { + returnErrorPage(ctx, fasthttp.StatusFailedDependency) + } + return } // Check if the first directory is a repo with a "pages" branch - targetBranch, targetOptions.BranchTimestamp = getBranchTimestamp(targetOwner, targetRepo, "pages") - if targetOptions.BranchTimestamp == (time.Time{}) { - targetRepo = "pages" - targetBranch = "" - targetPath = strings.Trim(pathElements[0]+"/"+targetPath, "/") + // example.codeberg.page/myrepo/index.html + if tryBranch(pathElements[0], "pages", pathElements[1:], "") { + tryUpstream() + return } + + // Try to use the "pages" repo on its default branch + // example.codeberg.page/index.html + if tryBranch("pages", "", pathElements, "") { + tryUpstream() + return + } + + // Couldn't find a valid repo/branch + returnErrorPage(ctx, fasthttp.StatusFailedDependency) + return } else { // Serve pages from external domains @@ -129,23 +199,6 @@ func handler(ctx *fasthttp.RequestCtx) { return } } - - // Check if a username can't exist because it's reserved (we'd risk to hit a Gitea route in that case) - if _, ok := ReservedUsernames[targetOwner]; ok { - returnErrorPage(ctx, fasthttp.StatusForbidden) - return - } - - // Check for blob path - if strings.HasPrefix(targetPath, "blob/") { - returnErrorPage(ctx, fasthttp.StatusForbidden) - return - } - - // Try to request the file from the Gitea API - if !upstream(ctx, targetOwner, targetRepo, targetBranch, targetPath, targetOptions) { - returnErrorPage(ctx, ctx.Response.StatusCode()) - } } // returnErrorPage sets the response status code and writes NotFoundPage to the response body, with "%status" replaced @@ -153,7 +206,7 @@ func handler(ctx *fasthttp.RequestCtx) { func returnErrorPage(ctx *fasthttp.RequestCtx, code int) { ctx.Response.SetStatusCode(code) ctx.Response.Header.SetContentType("text/html; charset=utf-8") - ctx.Response.SetBody(bytes.ReplaceAll(NotFoundPage, []byte("%status"), []byte(strconv.Itoa(code) + " " + fasthttp.StatusMessage(code)))) + ctx.Response.SetBody(bytes.ReplaceAll(NotFoundPage, []byte("%status"), []byte(strconv.Itoa(code)+" "+fasthttp.StatusMessage(code)))) } // getBranchTimestamp finds the default branch (if branch is "") and returns the last modification time of the branch @@ -163,8 +216,9 @@ func getBranchTimestamp(owner, repo, branch string) (branchWithFallback string, branchWithFallback = branch if branch == "" { var body = make([]byte, 0) - status, body, err := fasthttp.GetTimeout(body, string(GiteaRoot)+"/api/v1/repos/"+owner+"/"+repo, 10*time.Second) + status, body, err := fasthttp.GetTimeout(body, string(GiteaRoot)+"/api/v1/repos/"+url.PathEscape(owner)+"/"+url.PathEscape(repo), 10*time.Second) if err != nil || status != 200 { + fmt.Printf("Default branch request to Gitea API failed with status code %d and error %s\n", status, err) branchWithFallback = "" return } @@ -173,8 +227,9 @@ func getBranchTimestamp(owner, repo, branch string) (branchWithFallback string, } var body = make([]byte, 0) - status, body, err := fasthttp.GetTimeout(body, string(GiteaRoot)+"/api/v1/repos/"+owner+"/"+repo+"/branches/"+branch, 10*time.Second) + status, body, err := fasthttp.GetTimeout(body, string(GiteaRoot)+"/api/v1/repos/"+url.PathEscape(owner)+"/"+url.PathEscape(repo)+"/branches/"+url.PathEscape(branch), 10*time.Second) if err != nil || status != 200 { + fmt.Printf("Branch info request to Gitea API failed with status code %d and error %s\n", status, err) branchWithFallback = "" return } @@ -196,12 +251,17 @@ func upstream(ctx *fasthttp.RequestCtx, targetOwner string, targetRepo string, t // Handle repositories with no/broken pages setup if options.BranchTimestamp == (time.Time{}) || targetBranch == "" { - ctx.Response.SetStatusCode(fasthttp.StatusNotFound) + ctx.Response.SetStatusCode(fasthttp.StatusFailedDependency) ctx.Response.Header.SetContentType("text/html; charset=utf-8") ctx.Response.SetBody(bytes.ReplaceAll(NotFoundPage, []byte("%status"), []byte("pages not set up for this repo"))) return true } + if targetOwner == "" || targetRepo == "" || targetBranch == "" { + returnErrorPage(ctx, fasthttp.StatusBadRequest) + return true + } + // Check if the browser has a cached version if ifModifiedSince, err := time.Parse(time.RFC1123, string(ctx.Request.Header.Peek("If-Modified-Since"))); err == nil { if !ifModifiedSince.Before(options.BranchTimestamp) { @@ -212,7 +272,7 @@ func upstream(ctx *fasthttp.RequestCtx, targetOwner string, targetRepo string, t // Make a GET request to the upstream URL req := fasthttp.AcquireRequest() - req.SetRequestURI(string(GiteaRoot) + "/api/v1/repos/" + targetOwner + "/" + targetRepo + "/raw/" + targetBranch + "/" + targetPath) + req.SetRequestURI(string(GiteaRoot) + "/api/v1/repos/" + url.PathEscape(targetOwner) + "/" + url.PathEscape(targetRepo) + "/raw/" + url.PathEscape(targetBranch) + "/" + url.PathEscape(targetPath)) res := fasthttp.AcquireResponse() err := fasthttp.DoTimeout(req, res, 10*time.Second) diff --git a/main.go b/main.go index ae4ac7b..a7081a4 100644 --- a/main.go +++ b/main.go @@ -20,7 +20,6 @@ import ( "fmt" "net" "os" - "strings" "time" _ "embed" @@ -67,14 +66,6 @@ var IndexPages = []string{ "index.html", } -// ReservedUsernames specifies the usernames that are reserved by Gitea and thus may not be used as owner names. -// The contents are taken from https://github.com/go-gitea/gitea/blob/master/models/user.go#L783; reserved names with -// dots are removed as they are forbidden for Codeberg Pages anyways. -var ReservedUsernames = createLookupMapFromWords(` - admin api assets attachments avatars captcha commits debug error explore ghost help install issues less login metrics milestones new notifications org plugins pulls raw repo search stars template user - -`) - // main sets up and starts the web server. func main() { // Make sure MainDomain has a trailing dot, and GiteaRoot has no trailing slash @@ -122,14 +113,3 @@ func envOr(env string, or string) string { } return or } - -func createLookupMapFromWords(input string) map[string]struct{} { - var res = map[string]struct{}{} - input = strings.NewReplacer("\t", " ", "\n", " ", "\r", " ").Replace(input) - for _, word := range strings.Split(input, " ") { - if len(word) > 0 { - res[word] = struct{}{} - } - } - return res -}