I am very, very memory careful as I have to write programs that need to cope with massive datasets.
Currently my application quickly reaches 32GB of memory, starts swapping, and then gets killed by the system.
I do not understand how this can be since all variables are collectable (in functions and quickly released) except TokensStruct and TokensCount in the Trainer struct. TokensCount is just a uint. TokensStruct is a 1,000,000 row slice of [5]uint32 and string, so that means 20 bytes + string, which we could call a maximum of 50 bytes per record. 50*1000000 = 50MB of memory required. So this script should therefore not use much more than 50MB + overhead + temporary collectable variables in the functions (maybe another 50MB max.) The maximum potential size of TokensStruct is 5,000,000, as this is the size of dictionary, but even then it would be only 250MB of memory. dictionary is a map and apparently uses around 600MB of memory, as that is how the app starts, but this is not an issue because dictionary is only loaded once and never written to again.
Instead it uses 32GB of memory then dies. By the speed that it does this I expect it would happily get to 1TB of memory if it could. The memory appears to increase in a linear fashion with the size of the files being loaded, meaning that it appears to never clear any memory at all. Everything that enters the app is allocated more memory and memory is never freed.
I tried implementing runtime.GC() in case the garbage collection wasn't running often enough, but this made no difference.
Since the memory usage increases in a linear fashion then this would imply that there is a memory leak in GetTokens() or LoadZip(). I don't know how this could be, since they are both functions and only do one task and then close. Or it could be that the tokens variable in Start() is the cause of the leak. Basically it looks like every file that is loaded and parsed is never released from memory, as that is the only way that the memory could fill up in a linear fashion and keep on rising up to 32GB++.
Absolute nightmare! What's wrong with Go? Any way to fix this?
package main
import (
"bytes"
"code.google.com/p/go.text/transform"
"code.google.com/p/go.text/unicode/norm"
"compress/zlib"
"encoding/gob"
"fmt"
"github.com/AlasdairF/BinSearch"
"io/ioutil"
"os"
"regexp"
"runtime"
"strings"
"unicode"
"unicode/utf8"
)
type TokensStruct struct {
binsearch.Key_string
Value [][5]uint32
}
type Trainer struct {
Tokens TokensStruct
TokensCount uint
}
func checkErr(err error) {
if err == nil {
return
}
fmt.Println(`Some Error:`, err)
panic(err)
}
// Local helper function for normalization of UTF8 strings.
func isMn(r rune) bool {
return unicode.Is(unicode.Mn, r) // Mn: nonspacing marks
}
// This map is used by RemoveAccents function to convert non-accented characters.
var transliterations = map[rune]string{'Æ': "E", 'Ð': "D", 'Ł': "L", 'Ø': "OE", 'Þ': "Th", 'ß': "ss", 'æ': "e", 'ð': "d", 'ł': "l", 'ø': "oe", 'þ': "th", 'Œ': "OE", 'œ': "oe"}
// removeAccentsBytes converts accented UTF8 characters into their non-accented equivalents, from a []byte.
func removeAccentsBytesDashes(b []byte) ([]byte, error) {
mnBuf := make([]byte, len(b))
t := transform.Chain(norm.NFD, transform.RemoveFunc(isMn), norm.NFC)
n, _, err := t.Transform(mnBuf, b, true)
if err != nil {
return nil, err
}
mnBuf = mnBuf[:n]
tlBuf := bytes.NewBuffer(make([]byte, 0, len(mnBuf)*2))
for i, w := 0, 0; i < len(mnBuf); i += w {
r, width := utf8.DecodeRune(mnBuf[i:])
if r == '-' {
tlBuf.WriteByte(' ')
} else {
if d, ok := transliterations[r]; ok {
tlBuf.WriteString(d)
} else {
tlBuf.WriteRune(r)
}
}
w = width
}
return tlBuf.Bytes(), nil
}
func LoadZip(filename string) ([]byte, error) {
// Open file for reading
fi, err := os.Open(filename)
if err != nil {
return nil, err
}
defer fi.Close()
// Attach ZIP reader
fz, err := zlib.NewReader(fi)
if err != nil {
return nil, err
}
defer fz.Close()
// Pull
data, err := ioutil.ReadAll(fz)
if err != nil {
return nil, err
}
return norm.NFC.Bytes(data), nil // return normalized
}
func getTokens(pibn string) []string {
var data []byte
var err error
data, err = LoadZip(`/storedir/` + pibn + `/text.zip`)
checkErr(err)
data, err = removeAccentsBytesDashes(data)
checkErr(err)
data = bytes.ToLower(data)
data = reg2.ReplaceAll(data, []byte("$2")) // remove contractions
data = reg.ReplaceAllLiteral(data, nil)
tokens := strings.Fields(string(data))
return tokens
}
func (t *Trainer) Start() {
data, err := ioutil.ReadFile(`list.txt`)
checkErr(err)
pibns := bytes.Fields(data)
for i, pibn := range pibns {
tokens := getTokens(string(pibn))
t.addTokens(tokens)
if i%100 == 0 {
runtime.GC() // I added this just to try to stop the memory craziness, but it makes no difference
}
}
}
func (t *Trainer) addTokens(tokens []string) {
for _, tok := range tokens {
if _, ok := dictionary[tok]; ok {
if indx, ok2 := t.Tokens.Find(tok); ok2 {
ar := t.Tokens.Value[indx]
ar[0]++
t.Tokens.Value[indx] = ar
t.TokensCount++
} else {
t.Tokens.AddKeyAt(tok, indx)
t.Tokens.Value = append(t.Tokens.Value, [5]uint32{0, 0, 0, 0, 0})
copy(t.Tokens.Value[indx+1:], t.Tokens.Value[indx:])
t.Tokens.Value[indx] = [5]uint32{1, 0, 0, 0, 0}
t.TokensCount++
}
}
}
return
}
func LoadDictionary() {
dictionary = make(map[string]bool)
data, err := ioutil.ReadFile(`dictionary`)
checkErr(err)
words := bytes.Fields(data)
for _, word := range words {
strword := string(word)
dictionary[strword] = false
}
}
var reg = regexp.MustCompile(`[^a-z0-9\s]`)
var reg2 = regexp.MustCompile(`\b(c|l|all|dall|dell|nell|sull|coll|pell|gl|agl|dagl|degl|negl|sugl|un|m|t|s|v|d|qu|n|j)'([a-z])`) //contractions
var dictionary map[string]bool
func main() {
trainer := new(Trainer)
LoadDictionary()
trainer.Start()
}
Make sure that if you're tokenizing from a large string, to avoid memory pinning. From the comments above, it sounds like the tokens are substrings of a large string.
You may need to add a little extra in your getTokens() function so it guarantees the tokens aren't pinning memory.
func getTokens(...) {
// near the end of your program
for i, t := range(tokens) {
tokens[i] = string([]byte(t))
}
}
By the way, reading the whole file into memory using ioutil.ReadFile all at once looks dubious. Are you sure you can't use bufio.Scanner?
I'm looking at the code more closely... if you are truly concerned about memory, take advantage of io.Reader. You should try to avoid sucking in the content of a whole file at once. Use io.Reader and the transform "along the grain". The way you're using it now is against the grain of its intent. The whole point of the transform package you're using is to construct flexible Readers that can stream through data.
For example, here's a simplification of what you're doing:
package main
import (
"bufio"
"bytes"
"fmt"
"unicode/utf8"
"code.google.com/p/go.text/transform"
)
type AccentsTransformer map[rune]string
func (a AccentsTransformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
for nSrc < len(src) {
// If we're at the edge, note this and return.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
return
}
r, width := utf8.DecodeRune(src[nSrc:])
if r == utf8.RuneError && width == 1 {
err = fmt.Errorf("Decoding error")
return
}
if d, ok := a[r]; ok {
if nDst+len(d) > len(dst) {
err = transform.ErrShortDst
return
}
copy(dst[nDst:], d)
nSrc += width
nDst += len(d)
continue
}
if nDst+width > len(dst) {
err = transform.ErrShortDst
return
}
copy(dst[nDst:], src[nSrc:nSrc+width])
nDst += width
nSrc += width
}
return
}
func main() {
transliterations := AccentsTransformer{'Æ': "E", 'Ø': "OE"}
testString := "cØØl beÆns"
b := transform.NewReader(bytes.NewBufferString(testString), transliterations)
scanner := bufio.NewScanner(b)
scanner.Split(bufio.ScanWords)
for scanner.Scan() {
fmt.Println("token:", scanner.Text())
}
}
It becomes really easy then to chain transformers together. So, for example, if we wanted to remove all hyphens from the input stream, it's just a matter of using transform.Chain appropriately:
func main() {
transliterations := AccentsTransformer{'Æ': "E", 'Ø': "OE"}
removeHyphens := transform.RemoveFunc(func(r rune) bool {
return r == '-'
})
allTransforms := transform.Chain(transliterations, removeHyphens)
testString := "cØØl beÆns - the next generation"
b := transform.NewReader(bytes.NewBufferString(testString), allTransforms)
scanner := bufio.NewScanner(b)
scanner.Split(bufio.ScanWords)
for scanner.Scan() {
fmt.Println("token:", scanner.Text())
}
}
I have not exhaustively tested the code above, so please don't just copy-and-paste it without sufficient tests. :P I just cooked it up fast. But this kind of approach --- avoiding whole-file reading --- will scale better because it will read the file in chunks.
1 How large are "list.txt" and "dictionary"? If it is so large, No wonder the memory is so large
pibns := bytes.Fields(data)
how much is len(pibns)?
2 start the gc debug ( do GODEBUG="gctrace=1" ./yourprogram ) to see if there is any gc happening
3 do some profile like this:
func lookupMem(){
if f, err := os.Create("mem_prof"+time.Now.Unix()); err != nil {
log.Debug("record memory profile failed: %v", err)
} else {
runtime.GC()
pprof.WriteHeapProfile(f)
f.Close()
}
if f, err := os.Create("heap_prof" + "." + timestamp); err != nil {
log.Debug("heap profile failed:", err)
} else {
p := pprof.Lookup("heap")
p.WriteTo(f, 2)
}
}
func (t *Trainer) Start() {
.......
if i%1000==0 {
//if `len(pibns)` is not very large , record some meminfo
lookupMem()
}
.......
Related
I am working on a project where during startup I need to read certain files and store it in memory in a map and then periodically look for new files if there are any and then replace whatever I had in memory in the map earlier during startup with this new data. Basically every time if there is a new file which is a full state then I want to refresh my in memory map objects to this new one instead of appending to it.
Below method loadAtStartupAndProcessNewChanges is called during server startup which reads the file and store data in memory. Also it starts a go-routine detectNewFiles which periodically checks if there are any new files and store it on a deltaChan channel which is later accessed by another go-routine processNewFiles to read that new file again and store data in the same map. If there is any error then we store it on err channel. loadFiles is the function which will read files in memory and store it in map.
type customerConfig struct {
deltaChan chan string
err chan error
wg sync.WaitGroup
data *cmap.ConcurrentMap
}
// this is called during server startup.
func (r *customerConfig) loadAtStartupAndProcessNewChanges() error {
path, err := r.GetPath("...", "....")
if err != nil {
return err
}
r.wg.Add(1)
go r.detectNewFiles(path)
err = r.loadFiles(4, path)
if err != nil {
return err
}
r.wg.Add(1)
go r.processNewFiles()
return nil
}
This method basically figures out if there are any new files that needs to be consumed and if there is any then it will put it on the deltaChan channel which will be later on consumed by processNewFiles go-routine and read the file in memory. If there is any error then it will add error to the error channel.
func (r *customerConfig) detectNewFiles(rootPath string) {
}
This will read all s3 files and store it in memory and return error. In this method I clear previous state of my map so that it can have fresh state from new files. This method is called during server startup and also called whenever we need to process new files from processNewFiles go-routine.
func (r *customerConfig) loadFiles(workers int, path string) error {
var err error
...
var files []string
files = .....
// reset the map so that it can have fresh state from new files.
r.data.Clear()
g, ctx := errgroup.WithContext(context.Background())
sem := make(chan struct{}, workers)
for _, file := range files {
select {
case <-ctx.Done():
break
case sem <- struct{}{}:
}
file := file
g.Go(func() error {
defer func() { <-sem }()
return r.read(spn, file, bucket)
})
}
if err := g.Wait(); err != nil {
return err
}
return nil
}
This method read the files and add in the data concurrent map.
func (r *customerConfig) read(file string, bucket string) error {
// read file and store it in "data" concurrent map
// and if there is any error then return the error
var err error
fr, err := pars3.NewS3FileReader(context.Background(), bucket, file, r.s3Client.GetSession().Config)
if err != nil {
return errs.Wrap(err)
}
defer xio.CloseIgnoringErrors(fr)
pr, err := reader.NewParquetReader(fr, nil, 8)
if err != nil {
return errs.Wrap(err)
}
if pr.GetNumRows() == 0 {
spn.Infof("Skipping %s due to 0 rows", file)
return nil
}
for {
rows, err := pr.ReadByNumber(r.cfg.RowsToRead)
if err != nil {
return errs.Wrap(err)
}
if len(rows) <= 0 {
break
}
byteSlice, err := json.Marshal(rows)
if err != nil {
return errs.Wrap(err)
}
var invMods []CompModel
err = json.Unmarshal(byteSlice, &invMods)
if err != nil {
return errs.Wrap(err)
}
for i := range invMods {
key := strconv.FormatInt(invMods[i].ProductID, 10) + ":" + strconv.Itoa(int(invMods[i].Iaz))
hasInventory := false
if invMods[i].Available > 0 {
hasInventory = true
}
r.data.Set(key, hasInventory)
}
}
return nil
}
This method will pick what is there on the delta channel and if there are any new files then it will start reading that new file by calling loadFiles method. If there is any error then it will add error to the error channel.
// processNewFiles - load new files found by detectNewFiles
func (r *customerConfig) processNewFiles() {
// find new files on delta channel
// and call "loadFiles" method to read it
// if there is any error, then it will add it to the error channel.
}
If there is any error on the error channel then it will log those errors from below method -
func (r *customerConfig) handleError() {
// read error from error channel if there is any
// then log it
}
Problem Statement
Above logic works for me without any issues but there is one small bug in my code which I am not able to figure out on how to solve it. As you can see I have a concurrent map which I am populating in my read method and also clearing that whole map in loadFiles method. Because whenever there is a new file on delta channel I don't want to keep previous state in the map so that's why I am removing everything from the map and then adding new state from new files to it.
Now if there is any error in read method then the bug happens bcoz I have already cleared all the data in my data map which will have empty map which is not what I want. Basically if there is any error then I would like to preserve previous state in the data map. How can I resolve this issue in my above current design.
Note: I am using golang concurrent map
I think your design is over complicated. It can be solved much simpler, which gives all the benefits you desire:
safe for concurrent access
detected changes are reloaded
accessing the config gives you the most recent, successfully loaded config
the most recent config is always, immediately accessible, even if loading a new config due to detected changes takes long
if loading new config fails, the previous "snapshot" is kept and remains the current
as a bonus, it's much simpler and doesn't even use 3rd party libs
Let's see how to achieve this:
Have a CustomerConfig struct holding everything you want to cache (this is the "snapshot"):
type CustomerConfig struct {
Data map[string]bool
// Add other props if you need:
LoadedAt time.Time
}
Provide a function that loads the config you wish to cache. Note: this function is stateless, it does not access / operate on package level variables:
func loadConfig() (*CustomerConfig, error) {
cfg := &CustomerConfig{
Data: map[string]bool{},
LoadedAt: time.Now(),
}
// Logic to load files, and populate cfg.Data
// If an error occurs, return it
// If loading succeeds, return the config
return cfg, nil
}
Now let's create our "cache manager". The cache manager stores the actual / current config (the snapshot), and provides access to it. For safe concurrent access (and update), we use a sync.RWMutex. Also has means to stop the manager (to stop the concurrent refreshing):
type ConfigCache struct {
configMu sync.RWMutex
config *CustomerConfig
closeCh chan struct{}
}
Creating a cache loads the initial config. Also launches a goroutine that will be responsible to periodically check for changes.
func NewConfigCache() (*ConfigCache, error) {
cfg, err := loadConfig()
if err != nil {
return nil, fmt.Errorf("loading initial config failed: %w", err)
}
cc := &ConfigCache{
config: cfg,
closeCh: make(chan struct{}),
}
// launch goroutine to periodically check for changes, and load new configs
go cc.refresher()
return cc, nil
}
The refresher() periodically checks for changes, and if changes are detected, calls loadConfig() to load new data to be cached, and stores it as the current / actual config (while locking configMu). It also monitors closeCh to stop if that is requested:
func (cc *ConfigCache) refresher() {
ticker := time.NewTicker(1 * time.Minute) // Every minute
defer ticker.Stop()
for {
select {
case <-ticker.C:
// Check if there are changes
changes := false // logic to detect changes
if !changes {
continue // No changes, continue
}
// Changes! load new config:
cfg, err := loadConfig()
if err != nil {
log.Printf("Failed to load config: %v", err)
continue // Keep the previous config
}
// Apply / store new config
cc.configMu.Lock()
cc.config = cfg
cc.configMu.Unlock()
case <-cc.closeCh:
return
}
}
}
Closing the cache manager (the refresher goroutine) is as easy as:
func (cc *ConfigCache) Stop() {
close(cc.closeCh)
}
The last missing piece is how you access the current config. That's a simple GetConfig() method (that also uses configMu, but in read-only mode):
func (cc *ConfigCache) GetConfig() *CustomerConfig {
cc.configMu.RLock()
defer cc.configMu.RUnlock()
return cc.config
}
This is how you can use this:
cc, err := NewConfigCache()
if err != nil {
// Decide what to do: retry, terminate etc.
}
// Where ever, whenever you need the actual (most recent) config in your app:
cfg := cc.GetConfig()
// Use cfg
Before you shut down your app (or you want to stop the refreshing), you may call cc.Stop().
Added RWMutex for collectedData concurrent write protecting by worker goroutine
type customerConfig struct {
...
m sync.RWMutex
}
Instead of updating map in read method let read method just return the data and error
func (r *customerConfig) read(file string, bucket string) ([]CompModel, error) {
// read file data and return with error if any
var err error
fr, err := pars3.NewS3FileReader(context.Background(), bucket, file, r.s3Client.GetSession().Config)
if err != nil {
return (nil, errs.Wrap(err))
}
defer xio.CloseIgnoringErrors(fr)
pr, err := reader.NewParquetReader(fr, nil, 8)
if err != nil {
return (nil, errs.Wrap(err))
}
if pr.GetNumRows() == 0 {
spn.Infof("Skipping %s due to 0 rows", file)
return (nil, errors.New("No Data"))
}
var invMods = []CompModel{}
for {
rows, err := pr.ReadByNumber(r.cfg.RowsToRead)
if err != nil {
return (nil, errs.Wrap(err))
}
if len(rows) <= 0 {
break
}
byteSlice, err := json.Marshal(rows)
if err != nil {
return (nil, errs.Wrap(err))
}
var jsonData []CompModel
err = json.Unmarshal(byteSlice, &jsonData)
if err != nil {
return (nil, errs.Wrap(err))
}
invMods = append(invMods, jsonData...)
}
return invMods, nil
}
And then loadFiles you can collect the data return by read
method and if no error only then clear and update the map else
leave the old data as it was before
func (r *customerConfig) loadFiles(workers int, path string) error {
var err error
...
var files []string
files = .....
// reset the map so that it can have fresh state from new files.
// r.data.Clear() <- remove the clear from here
g, ctx := errgroup.WithContext(context.Background())
sem := make(chan struct{}, workers)
collectedData := []CompModel{}
for _, file := range files {
select {
case <-ctx.Done():
break
case sem <- struct{}{}:
}
file := file
g.Go(func() error {
defer func() { <-sem }()
data, err:= r.read(spn, file, bucket)
if err != nil {
return err
}
r.m.Lock()
append(collectedData, data...)
r.m.Unlock()
return nil
})
}
if err := g.Wait(); err != nil {
return err
}
r.data.Clear()
for i := range collectedData {
key := strconv.FormatInt(collectedData[i].ProductID, 10) + ":" + strconv.Itoa(int(collectedData[i].Iaz))
hasInventory := false
if collectedData[i].Available > 0 {
hasInventory = true
}
r.data.Set(key, hasInventory)
}
return nil
}
Note: Since the code is not runnable just updated methods for reference and I have not include mutex lock for updating the slice you may need to handle for the case.
The same can be achieved with just 3 functions - detect, read, load, detect will check for new files by interval and push to delta channel if found any, load will get file path to read from delta channel and call read method to get the data and error then checks if no error then clear the map and update with new content else log the error, so you would have 2 go routines and 1 function which would be called by load routine
package main
import (
"fmt"
"time"
"os"
"os/signal"
"math/rand"
)
func main() {
fmt.Println(">>>", center("STARTED", 30), "<<<")
c := &Config{
InitialPath: "Old Path",
DetectInterval: 3000,
}
c.start()
fmt.Println(">>>", center("ENDED", 30), "<<<")
}
// https://stackoverflow.com/questions/41133006/how-to-fmt-printprint-this-on-the-center
func center(s string, w int) string {
return fmt.Sprintf("%[1]*s", -w, fmt.Sprintf("%[1]*s", (w + len(s))/2, s))
}
type Config struct {
deltaCh chan string
ticker *time.Ticker
stopSignal chan os.Signal
InitialPath string
DetectInterval time.Duration
}
func (c *Config) start() {
c.stopSignal = make(chan os.Signal, 1)
signal.Notify(c.stopSignal, os.Interrupt)
c.ticker = time.NewTicker(c.DetectInterval * time.Millisecond)
c.deltaCh = make(chan string, 1)
go c.detect()
go c.load()
if c.InitialPath != "" {
c.deltaCh <- c.InitialPath
}
<- c.stopSignal
c.ticker.Stop()
}
// Detect New Files
func (c *Config) detect() {
for {
select {
case <- c.stopSignal:
return
case <- c.ticker.C:
fmt.Println(">>>", center("DETECT", 30), "<<<")
c.deltaCh <- fmt.Sprintf("PATH %f", rand.Float64() * 1.5)
}
}
}
// Read Files
func read(path string) (map[string]int, error) {
data := make(map[string]int)
data[path] = 0
fmt.Println(">>>", center("READ", 30), "<<<")
fmt.Println(path)
return data, nil
}
// Load Files
func (c *Config) load() {
for {
select {
case <- c.stopSignal:
return
case path := <- c.deltaCh:
fmt.Println(">>>", center("LOAD", 30), "<<<")
data, err := read(path)
if err != nil {
fmt.Println("Log Error")
} else {
fmt.Println("Success", data)
}
fmt.Println()
}
}
}
Note: Not included map in sample code it can be easily updated to include map
Just allocate new one map. Like this:
var mu sync.Mutex
before := map[string]string{} // Some map before reading
after := make(map[string]string)
// Read files and fill `after` map
mu.Lock()
before = after
mu.Unlock()
Instead of clearing the map in loadFile method, do something like this in read
func (r *customerConfig) read(file string, bucket string) error {
m := cmap.New() // create a new map
// ...
for {
rows, err := pr.ReadByNumber(r.cfg.RowsToRead)
if err != nil {
return errs.Wrap(err)
}
if len(rows) <= 0 {
break
}
byteSlice, err := json.Marshal(rows)
if err != nil {
return errs.Wrap(err)
}
var invMods []CompModel
err = json.Unmarshal(byteSlice, &invMods)
if err != nil {
return errs.Wrap(err)
}
for i := range invMods {
key := strconv.FormatInt(invMods[i].ProductID, 10) + ":" + strconv.Itoa(int(invMods[i].Iaz))
hasInventory := false
if invMods[i].Available > 0 {
hasInventory = true
}
m.Set(key, hasInventory)
}
}
r.data = m // Use the new map
return nil
}
I'm new to go and have been using split to my advantage. Recently I came across a problem I wanted to split something, and keep the splitting char in my second slice rather than removing it, or leaving it in the first slice as with SplitAfter.
For example the following code:
strings.Split("email#email.com", "#")
returned: ["email", "email.com"]
strings.SplitAfter("email#email.com", "#")
returned: ["email#", "email.com"]
What's the best way to get ["email", "#email.com"]?
Use strings.Index to find the # and slice to get the two parts:
var part1, part2 string
if i := strings.Index(s, "#"); i >= 0 {
part1, part2 = s[:i], s[i:]
} else {
// handle case with no #
}
Run it on the playground.
Could this work for you?
s := strings.Split("email#email.com", "#")
address, domain := s[0], "#"+s[1]
fmt.Println(address, domain)
// email #email.com
Then combing and creating a string
var buffer bytes.Buffer
buffer.WriteString(address)
buffer.WriteString(domain)
result := buffer.String()
fmt.Println(result)
// email#email.com
You can use bufio.Scanner:
package main
import (
"bufio"
"strings"
)
func email(data []byte, eof bool) (int, []byte, error) {
for i, b := range data {
if b == '#' {
if i > 0 {
return i, data[:i], nil
}
return len(data), data, nil
}
}
return 0, nil, nil
}
func main() {
s := bufio.NewScanner(strings.NewReader("email#email.com"))
s.Split(email)
for s.Scan() {
println(s.Text())
}
}
https://golang.org/pkg/bufio#Scanner.Split
I'm going to develop a simple TCP client and server and I want to achieve high throughput (300000 Requests/Second) which is easy to reach with Cpp or C TCP client and server on a server hardware. I mean a server with 48 Cores and 64G Memory.
On my testbed, both client and server have 10G network interface card and I have receive-side-scaling at server side and transmit-packet-steering enabled at the client.
I configure the client to send 10 thousand requests per second. I just run multiple instances of Go go run client.go from a bash script to increase the throughput. However, in this way, Go is going to create lots of threads at the operating systems and a large number of threads results in high context switching cost, and I could not approach such throughputs. I suspected the number of Go instances I'm running from the command line. The code below is the code snippet for the client in the approach:
func Main(cmd_rate_int int, cmd_port string) {
//runtime.GOMAXPROCS(2) // set maximum number of processes to be used by this applications
//var rate float64 = float64(rate_int)
rate := float64(cmd_rate_int)
port = cmd_port
conn, err := net.Dial("tcp", port)
if err != nil {
fmt.Println("ERROR", err)
os.Exit(1)
}
var my_random_number float64 = nextTime(rate) * 1000000
var my_random_int int = int(my_random_number)
var int_message int64 = time.Now().UnixNano()
byte_message := make([]byte, 8)
go func(conn net.Conn) {
buf := make([]byte, 8)
for true {
_, err = io.ReadFull(conn, buf)
now := time.Now().UnixNano()
if err != nil {
return
}
last := int64(binary.LittleEndian.Uint64(buf))
fmt.Println((now - last) / 1000)
}
return
}(conn)
for true {
my_random_number = nextTime(rate) * 1000000
my_random_int = int(my_random_number)
time.Sleep(time.Microsecond * time.Duration(my_random_int))
int_message = time.Now().UnixNano()
binary.LittleEndian.PutUint64(byte_message, uint64(int_message))
conn.Write(byte_message)
}
}
So I try to run all my Go threads by calling go client() in the main so I do not run multiple instances in the Linux command line. I thought it may be a better idea. And it is really a better idea basically and the number of threads doesn't increase toward 700 or so in the operating system. But the throughput still is low and it seems it doesn't employ all capability of the underlying hardware. Actually, you may want to see the code I have run in the second approach:
func main() {
//runtime.GOMAXPROCS(2) // set maximum number of processes to be used by this applications
args := os.Args[1:]
rate_int, _ := strconv.Atoi(args[0])
client_size, _ := strconv.Atoi(args[1])
port := args[2]
i := 0
for i <= client_size {
go client.Main(rate_int, port)
i = i + 1
}
for true {
}
}
I was wondering what is the best practice for in order to reach high throughput? I have always heard that Go is lightweight and performant and pretty comparable with C/Cpp pthread. However, I think in terms of performance still C/Cpp is far far better than Go. I might do something really wrong on this issue, so I would be happy if anybody can help to achieve high throughput with Go.
this is a quick rework of the op code.
As the original source code is working, it does not provide a solution, however it illustrates bucket token usage, and few other small go tips.
It does re use similar default values as op source code.
It demonstrates you do not need two files / programs, to provide both client and server.
It demonstrates usage of flag package.
It shows how to parse unix nano timestamp appropriately using time.Unix(x,y)
It shows how to take advantage of io.Copy to write-what-you-read on the same net.Conn. Rather than manual writing.
Still, this is improper for production delivery.
package main
import (
"encoding/binary"
"flag"
"fmt"
"io"
"log"
"math"
"math/rand"
"net"
"os"
"sync/atomic"
"time"
"github.com/juju/ratelimit"
)
var total_rcv int64
func main() {
var cmd_rate_int float64
var cmd_port string
var client_size int
flag.Float64Var(&cmd_rate_int, "rate", 400000, "change rate of message reading")
flag.StringVar(&cmd_port, "port", ":9090", "port to listen")
flag.IntVar(&client_size, "size", 20, "number of clients")
flag.Parse()
t := flag.Arg(0)
if t == "server" {
server(cmd_port)
} else if t == "client" {
for i := 0; i < client_size; i++ {
go client(cmd_rate_int, cmd_port)
}
// <-make(chan bool) // infinite wait.
<-time.After(time.Second * 2)
fmt.Println("total exchanged", total_rcv)
} else if t == "client_ratelimit" {
bucket := ratelimit.NewBucketWithQuantum(time.Second, int64(cmd_rate_int), int64(cmd_rate_int))
for i := 0; i < client_size; i++ {
go clientRateLimite(bucket, cmd_port)
}
// <-make(chan bool) // infinite wait.
<-time.After(time.Second * 3)
fmt.Println("total exchanged", total_rcv)
}
}
func server(cmd_port string) {
ln, err := net.Listen("tcp", cmd_port)
if err != nil {
panic(err)
}
for {
conn, err := ln.Accept()
if err != nil {
panic(err)
}
go io.Copy(conn, conn)
}
}
func client(cmd_rate_int float64, cmd_port string) {
conn, err := net.Dial("tcp", cmd_port)
if err != nil {
log.Println("ERROR", err)
os.Exit(1)
}
defer conn.Close()
go func(conn net.Conn) {
buf := make([]byte, 8)
for {
_, err := io.ReadFull(conn, buf)
if err != nil {
break
}
// int_message := int64(binary.LittleEndian.Uint64(buf))
// t2 := time.Unix(0, int_message)
// fmt.Println("ROUDNTRIP", time.Now().Sub(t2))
atomic.AddInt64(&total_rcv, 1)
}
return
}(conn)
byte_message := make([]byte, 8)
for {
wait := time.Microsecond * time.Duration(nextTime(cmd_rate_int))
if wait > 0 {
time.Sleep(wait)
fmt.Println("WAIT", wait)
}
int_message := time.Now().UnixNano()
binary.LittleEndian.PutUint64(byte_message, uint64(int_message))
_, err := conn.Write(byte_message)
if err != nil {
log.Println("ERROR", err)
return
}
}
}
func clientRateLimite(bucket *ratelimit.Bucket, cmd_port string) {
conn, err := net.Dial("tcp", cmd_port)
if err != nil {
log.Println("ERROR", err)
os.Exit(1)
}
defer conn.Close()
go func(conn net.Conn) {
buf := make([]byte, 8)
for {
_, err := io.ReadFull(conn, buf)
if err != nil {
break
}
// int_message := int64(binary.LittleEndian.Uint64(buf))
// t2 := time.Unix(0, int_message)
// fmt.Println("ROUDNTRIP", time.Now().Sub(t2))
atomic.AddInt64(&total_rcv, 1)
}
return
}(conn)
byte_message := make([]byte, 8)
for {
bucket.Wait(1)
int_message := time.Now().UnixNano()
binary.LittleEndian.PutUint64(byte_message, uint64(int_message))
_, err := conn.Write(byte_message)
if err != nil {
log.Println("ERROR", err)
return
}
}
}
func nextTime(rate float64) float64 {
return -1 * math.Log(1.0-rand.Float64()) / rate
}
Edit This is a pretty bad answer. Check mh-cbon comments for the reasons.
I don't fully understand how you're trying to do so, but if I want to control the rate on Go, I usually do 2 nested for loops:
for ;; time.Sleep(time.Second) {
go func (){
for i:=0; i<rate; i++ {
go func (){
// Do whatever
}()
}
}()
}
I'm starting a goroutine inside each loop to:
on the outer loop, to ensure it's only 1 second between iterations
on the inner loop, to ensure I can start all the requests I want
Putting this on a problem like yours, it would look something like:
package main
import (
"net"
"os"
"time"
)
const (
rate = 100000
address = "localhost:8090"
)
func main() {
conn, err := net.Dial("tcp", address)
if err != nil {
os.Stderr.Write([]byte(err.Error() + "\n"))
os.Exit(1)
}
for ; err == nil; time.Sleep(time.Second) {
go func() {
for i := 0; i < rate; i++ {
go func(conn net.Conn) {
if _, err := conn.Write([]byte("01234567")); err != nil {
os.Stderr.Write([]byte("\nConnection closed: " + err.Error() + "\n"))
}
}(conn)
}
}()
}
}
To verify that this is actually sending the target request rate, you can have a test TCP listener like this:
package main
import (
"fmt"
"net"
"os"
"time"
)
const (
address = ":8090"
payloadSize = 8
)
func main() {
count := 0
b := make([]byte, payloadSize)
l, err := net.Listen("tcp", address)
if err != nil {
fmt.Fprintf(os.Stdout, "\nCan't listen to address %v: %v\n", address, err)
return
}
defer l.Close()
go func() {
for ; ; time.Sleep(time.Second) {
fmt.Fprintf(os.Stdout, "\rRate: %v/s ", count)
count = 0
}
}()
for {
conn, err := l.Accept()
if err != nil {
fmt.Fprintf(os.Stderr, "\nFailed to accept connection: %v\n", err)
}
for {
_, err := conn.Read(b)
if err != nil {
fmt.Fprintf(os.Stderr, "\nConnection closed: %v\n", err)
break
}
count = count + 1
}
}
}
I found some issues due to not being able to write concurrently into the connection with an error inconsistent fdMutex. This is due to reaching over 0xfffff concurrent writes, which fdMutex does not support. To mitigate this issue, make sure you don't go over that number of concurrent writes. In my system, it was >100k/s. This is not the 300k/s you're expecting, but my system is not prepared for that.
here is my code and I don't understand why the decode function doesn't work.
Little insight would be great please.
func EncodeB64(message string) (retour string) {
base64Text := make([]byte, base64.StdEncoding.EncodedLen(len(message)))
base64.StdEncoding.Encode(base64Text, []byte(message))
return string(base64Text)
}
func DecodeB64(message string) (retour string) {
base64Text := make([]byte, base64.StdEncoding.DecodedLen(len(message)))
base64.StdEncoding.Decode(base64Text, []byte(message))
fmt.Printf("base64: %s\n", base64Text)
return string(base64Text)
}
It gaves me :
[Decode error - output not utf-8][Decode error - output not utf-8]
The len prefix is superficial and causes the invalid utf-8 error:
package main
import (
"encoding/base64"
"fmt"
"log"
)
func main() {
str := base64.StdEncoding.EncodeToString([]byte("Hello, playground"))
fmt.Println(str)
data, err := base64.StdEncoding.DecodeString(str)
if err != nil {
log.Fatal("error:", err)
}
fmt.Printf("%q\n", data)
}
(Also here)
Output
SGVsbG8sIHBsYXlncm91bmQ=
"Hello, playground"
EDIT: I read too fast, the len was not used as a prefix. dystroy got it right.
DecodedLen returns the maximal length.
This length is useful for sizing your buffer but part of the buffer won't be written and thus won't be valid UTF-8.
You have to use only the real written length returned by the Decode function.
l, _ := base64.StdEncoding.Decode(base64Text, []byte(message))
log.Printf("base64: %s\n", base64Text[:l])
To sum up the other two posts, here are two simple functions to encode/decode Base64 strings with Go:
// Dont forget to import "encoding/base64"!
func base64Encode(str string) string {
return base64.StdEncoding.EncodeToString([]byte(str))
}
func base64Decode(str string) (string, bool) {
data, err := base64.StdEncoding.DecodeString(str)
if err != nil {
return "", true
}
return string(data), false
}
Try it!
#Denys Séguret's answer is almost 100% correct. As an improvement to avoid wasting memory with non used space in base64Text, you should use base64.DecodedLen. Take a look at how base64.DecodeString uses it.
It should look like this:
func main() {
message := base64.StdEncoding.EncodeToString([]byte("Hello, playground"))
base64Text := make([]byte, base64.StdEncoding.DecodedLen(len(message)))
n, _ := base64.StdEncoding.Decode(base64Text, []byte(message))
fmt.Println("base64Text:", string(base64Text[:n]))
}
Try it here.
More or less like above, but using []bytes and part of a bigger struct:
func (s secure) encodePayload(body []byte) string {
//Base64 Encode
return base64.StdEncoding.EncodeToString(body)
}
func (s secure) decodePayload(body []byte) ([]byte, error) {
//Base64 Decode
b64 := make([]byte, base64.StdEncoding.DecodedLen(len(body)))
n, err := base64.StdEncoding.Decode(b64, body)
if err != nil {
return nil, err
}
return b64[:n], nil
}
So I am trying to make a program in GO to take a text file full of code and convert that into GO code and then save that file into a GO file or text file. I have been trying to figure out how to save the changes I made to the text file, but the only way I can see the changes is through a println statement because I am using strings.replace to search the string array that the text file is stored in and change each occurrence of a word that needs to be changed (ex. BEGIN -> { and END -> }). So is there any other way of searching and replacing in GO I don't know about or is there a way to edit a text file that I don't know about or is this impossible?
Thanks
Here is the code I have so far.
package main
import (
"os"
"bufio"
"bytes"
"io"
"fmt"
"strings"
)
func readLines(path string) (lines []string, errr error) {
var (
file *os.File
part []byte
prefix bool
)
if file, errr = os.Open(path); errr != nil {
return
}
defer file.Close()
reader := bufio.NewReader(file)
buffer := bytes.NewBuffer(make([]byte, 0))
for {
if part, prefix, errr = reader.ReadLine(); errr != nil {
break
}
buffer.Write(part)
if !prefix {
lines = append(lines, buffer.String())
buffer.Reset()
}
}
if errr == io.EOF {
errr = nil
}
return
}
func writeLines(lines []string, path string) (errr error) {
var (
file *os.File
)
if file, errr = os.Create(path); errr != nil {
return
}
defer file.Close()
for _,item := range lines {
_, errr := file.WriteString(strings.TrimSpace(item) + "\n");
if errr != nil {
fmt.Println(errr)
break
}
}
return
}
func FixBegin(lines []string) (errr error) {
var(
a string
)
for i := 0; ; i++ {
a = lines[i];
fmt.Println(strings.Replace(a, "BEGIN", "{", -1))
}
return
}
func FixEnd(lines []string) (errr error) {
var(
a string
)
for i := 0; ; i++ {
a = lines[i];
fmt.Println(strings.Replace(a, "END", "}", -1))
}
return
}
func main() {
lines, errr := readLines("foo.txt")
if errr != nil {
fmt.Println("Error: %s\n", errr)
return
}
for _, line := range lines {
fmt.Println(line)
}
errr = FixBegin(lines)
errr = writeLines(lines, "beer2.txt")
fmt.Println(errr)
errr = FixEnd(lines)
lines, errr = readLines("beer2.txt")
if errr != nil {
fmt.Println("Error: %s\n", errr)
return
}
errr = writeLines(lines, "beer2.txt")
fmt.Println(errr)
}
jnml#fsc-r630:~/src/tmp/SO/13789882$ ls
foo.txt main.go
jnml#fsc-r630:~/src/tmp/SO/13789882$ cat main.go
package main
import (
"bytes"
"io/ioutil"
"log"
)
func main() {
src, err := ioutil.ReadFile("foo.txt")
if err != nil {
log.Fatal(err)
}
src = bytes.Replace(src, []byte("BEGIN"), []byte("{"), -1)
src = bytes.Replace(src, []byte("END"), []byte("}"), -1)
if err = ioutil.WriteFile("beer2.txt", src, 0666); err != nil {
log.Fatal(err)
}
}
jnml#fsc-r630:~/src/tmp/SO/13789882$ cat foo.txt
BEGIN
FILE F(KIND=REMOTE);
EBCDIC ARRAY E[0:11];
REPLACE E BY "HELLO WORLD!";
WRITE(F, *, E);
END.
jnml#fsc-r630:~/src/tmp/SO/13789882$ go run main.go
jnml#fsc-r630:~/src/tmp/SO/13789882$ cat beer2.txt
{
FILE F(KIND=REMOTE);
EBCDIC ARRAY E[0:11];
REPLACE E BY "HELLO WORLD!";
WRITE(F, *, E);
}.
jnml#fsc-r630:~/src/tmp/SO/13789882$
I agree with #jnml wrt using ioutil to slurp the file and to write it back. But I think that the replacing shouldn't be done by multiple passes over []byte. Code and data are strings/text and should be treated as such (even if dealing with non ascii/utf8 encodings requires estra work); a one pass replacement (of all placeholders 'at once') avoids the risk of replacing results of previous changes (even if my regexp proposal must be improved to handle non-trivial tasks).
package main
import(
"fmt"
"io/ioutil"
"log"
"regexp"
"strings"
)
func main() {
// (1) slurp the file
data, err := ioutil.ReadFile("../tmpl/xpl.go")
if err != nil {
log.Fatal("ioutil.ReadFile: ", err)
}
s := string(data)
fmt.Printf("----\n%s----\n", s)
// => function that works for files of (known) other encodings that ascii or utf8
// (2) create a map that maps placeholder to be replaced to the replacements
x := map[string]string {
"BEGIN" : "{",
"END" : "}"}
ks := make([]string, 0, len(x))
for k := range x {
ks = append(ks, k)
}
// => function(s) that gets the keys from maps
// (3) create a regexp that finds the placeholder to be replaced
p := strings.Join(ks, "|")
fmt.Printf("/%s/\n", p)
r := regexp.MustCompile(p)
// => funny letters & order need more consideration
// (4) create a callback function for ..ReplaceAllStringFunc that knows
// about the map x
f := func(s string) string {
fmt.Printf("*** '%s'\n", s)
return x[s]
}
// => function (?) to do Step (2) .. (4) in a reusable way
// (5) do the replacing (s will be overwritten with the result)
s = r.ReplaceAllStringFunc(s, f)
fmt.Printf("----\n%s----\n", s)
// (6) write back
err = ioutil.WriteFile("result.go", []byte(s), 0644)
if err != nil {
log.Fatal("ioutil.WriteFile: ", err)
}
// => function that works for files of (known) other encodings that ascii or utf8
}
output:
go run 13789882.go
----
func main() BEGIN
END
----
/BEGIN|END/
*** 'BEGIN'
*** 'END'
----
func main() {
}
----
If your file size is huge, reading everything in memory might not be possible nor advised. Give BytesReplacingReader a try as it is done replacement in streaming fashion. And it's reasonably performant. If you want to replace two strings (such as BEGIN -> { and END -> }), just need to wrap two BytesReplacingReader over original reader, one for BEGIN and one for END:
r := NewBytesReplacingReader(
NewBytesReplacingReader(inputReader, []byte("BEGIN"), []byte("{"),
[]byte("END"), []byte("}")
// use r normally and all non-overlapping occurrences of
// "BEGIN" and "END" will be replaced with "{" and "}"