From e7afcc9f77852b40060f7a742c83886c127ffedf Mon Sep 17 00:00:00 2001 From: Evgeny Kuznetsov Date: Mon, 27 Jan 2020 20:15:39 +0300 Subject: isolate HTML cleaning --- main.go | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'main.go') diff --git a/main.go b/main.go index 23f6eac..0c8baf4 100644 --- a/main.go +++ b/main.go @@ -17,13 +17,14 @@ package main import ( "flag" - "github.com/gorilla/feeds" "io/ioutil" "log" "net/http" "regexp" "strconv" "time" + + "github.com/gorilla/feeds" ) type subst struct { @@ -32,7 +33,7 @@ type subst struct { } var ( - substitutes = []subst{ + substitutes = []subst{ // these need to be changed to show up properly in the feed {from: `&quot;`, to: `"`}, {from: `&ndash;`, to: `–`}, } @@ -147,9 +148,17 @@ func getPage(pageUrl string) []byte { if err != nil { log.Fatal(err) } + + page = cleanText(page) + + return page +} + +// cleanText replaces HTML-encoded symbols with proper UTF +func cleanText(b []byte) []byte { for _, sub := range substitutes { re := regexp.MustCompile(sub.from) - page = re.ReplaceAll(page, []byte(sub.to)) + b = re.ReplaceAll(b, []byte(sub.to)) } - return page + return b } -- cgit v1.2.3