Currently I have a scenario with a huge file (say, 500k lines of text), and the idea is to use workers (goroutines) to process it, roughly 100 lines per worker. After running my code, I still wonder why the goroutines consume the same line more than once. I'm guessing they're racing to get the job done.
Here's my code:
package main
import (
"log"
"bufio"
"fmt"
"encoding/csv"
"encoding/json"
"io"
"os"
"sync"
)
type IMDBDataModel struct {
Color string `json:"color"`
DirectorName string `json:"director_name"`
NumCriticForReviews string `json:"num_critic_for_reviews"`
Duration string `json:"duration"`
DirectorFacebookLikes string `json:"director_facebook_likes"`
Actor3FacebookLikes string `json:"actor_3_facebook_likes"`
Actor2Name string `json:"actor_2_name"`
Actor1FacebookLikes string `json:"actor_1_facebook_likes"`
Gross string `json:"gross"`
Genre string `json:"genres"`
Actor1Name string `json:"actor_1_name"`
MovieTitle string `json:"movie_title"`
NumVotedUser string `json:"num_voted_users"`
CastTotalFacebookLikes string `json:"cast_total_facebook_likes"`
Actor3Name string `json:"actor_3_name"`
FaceNumberInPoster string `json:"facenumber_in_poster"`
PlotKeywords string `json:"plot_keywords"`
MovieIMDBLink string `json:"movie_imdb_link"`
NumUserForReviews string `json:"num_user_for_reviews"`
Language string `json:"language"`
Country string `json:"country"`
ContentRating string `json:"content_rating"`
Budget string `json:"budget"`
TitleYear string `json:"title_year"`
Actor2FacebookLikes string `json:"actor_2_facebook_likes"`
IMDBScore string `json:"imdb_score"`
AspectRatio string `json:"aspect_ratio"`
MovieFacebookLikes string `json:"movie_facebook_likes"`
}
var iterated int64
var out []*IMDBDataModel
func populateString(input []IMDBDataModel, out []*IMDBDataModel, wg *sync.WaitGroup) {
for _ , data := range input {
out = append(out, &data)
}
wg.Done()
}
func consumeData(input <-chan *IMDBDataModel, wg *sync.WaitGroup){
defer wg.Done()
for data := range input {
iterated++
fmt.Printf("%d : %s\n", iterated, data.MovieTitle)
out = append(out, data)
}
fmt.Println("output size : ", len(out))
}
func processCSV(path string) (imdbList []IMDBDataModel){
csvFile, err := os.Open(path)
if err != nil {
log.Fatal(err)
}
reader := csv.NewReader(bufio.NewReader(csvFile))
for {
line, error := reader.Read()
if error == io.EOF {
break
} else if error != nil {
log.Fatal(error)
}
imdbList = append(imdbList,
IMDBDataModel{
Color: line[0],
DirectorName: line[1],
NumCriticForReviews : line[2],
Duration: line[3],
DirectorFacebookLikes: line[4],
Actor3FacebookLikes: line[5],
Actor2Name: line[6],
Actor1FacebookLikes: line[7],
Gross: line[8],
Genre: line[9],
Actor1Name: line[10],
MovieTitle: line[11],
NumVotedUser: line[12],
CastTotalFacebookLikes: line[13],
Actor3Name: line[14],
FaceNumberInPoster: line[15],
PlotKeywords: line[16],
MovieIMDBLink: line[17],
NumUserForReviews: line[18],
Language: line[19],
Country: line[20],
ContentRating: line[21],
Budget: line[22],
TitleYear: line[23],
Actor2FacebookLikes: line[24],
IMDBScore: line[25],
AspectRatio: line[26],
MovieFacebookLikes: line[27],
},
)
}
if _, err := json.Marshal(imdbList); err != nil {
log.Println(err)
}
return
}
func main() {
imdbList := processCSV("movie_metadata.csv")
imdbChannel := make(chan *IMDBDataModel, 100) // buffer
var wg sync.WaitGroup
for i := 0; i < 5; i++ {
wg.Add(1)
go consumeData(imdbChannel,&wg)
}
for _ ,task := range imdbList {
imdbChannel <- &task
}
close(imdbChannel)
wg.Wait()
// for _, item := range out {
// fmt.Println(item.MovieTitle)
// }
fmt.Println("Total Channel :", len(imdbChannel))
fmt.Println("Total IMDB :", len(imdbList))
fmt.Println("Total Data: ", len(out))
fmt.Println("Iterated : ", iterated)
fmt.Println("Goroutines finished..")
}
EDITED:
After a few suggestions about adding a mutex and another channel, this is the modified consume function:
func consumeData(input <-chan *IMDBDataModel, output chan *IMDBDataModel, wg *sync.WaitGroup) {
defer wg.Done()
for data := range input {
iterated++
// outLock.Lock()
// out = append(out, data)
// outLock.Unlock()
output <- data
}
}
However, it still consumes the same line more than once (a race occurred):
....
My Date with Drew
My Date with Drew
My Date with Drew
My Date with Drew
My Date with Drew
Total Channel : 0
Total IMDB : 5044
Total Data: 4944
Iterated : 5000
Goroutines finished..
Your issue is with:
var out []*IMDBDataModel
func consumeData(input <-chan *IMDBDataModel, wg *sync.WaitGroup){
defer wg.Done()
for data := range input {
iterated++
fmt.Printf("%d : %s\n", iterated, data.MovieTitle)
out = append(out, data)
}
fmt.Println("output size : ", len(out))
}
You are appending to "out" from multiple goroutines.
Try adding a lock around the places you write to "out", like this:
var out []*IMDBDataModel
var outLock sync.Mutex
func consumeData(input <-chan *IMDBDataModel, wg *sync.WaitGroup){
defer wg.Done()
for data := range input {
iterated++ // note: this increment still races across goroutines; it needs the lock too, or sync/atomic
fmt.Printf("%d : %s\n", iterated, data.MovieTitle)
outLock.Lock()
out = append(out, data) // data is already a *IMDBDataModel; &data would be a **IMDBDataModel
outLock.Unlock()
}
outLock.Lock()
fmt.Println("output size : ", len(out))
outLock.Unlock()
}
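That said, the duplicated titles most likely come from the producer side as well, not only from out: in the send loop, imdbChannel <- &task takes the address of the single loop variable task, which is reused on every iteration (pre-Go 1.22 semantics), so consumers can read the same pointer after the producer has overwritten it. A minimal sketch of the fix, sending a stable address per record; the atomic counter is my addition, not part of the original code:

for i := range imdbList {
	imdbChannel <- &imdbList[i] // address of the slice element: stable, one per record
}

// and in consumeData, instead of iterated++ (which also races):
atomic.AddInt64(&iterated, 1) // needs "sync/atomic"; iterated is already an int64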
Related
I have the following setup to parse a csv file:
package main
import (
"fmt"
"os"
"encoding/csv"
)
type CsvLine struct {
Id string
Array1 []string
Array2 []string
}
func ReadCsv(filename string) ([][]string, error) {
f, err := os.Open(filename)
if err != nil {
return [][]string{}, err
}
defer f.Close()
lines, err := csv.NewReader(f).ReadAll()
if err != nil {
return [][]string{}, err
}
return lines, nil
}
func main() {
lines, err := ReadCsv("./data/sample-0.3.csv")
if err != nil {
panic(err)
}
for _, line := range lines {
fmt.Println(line)
data := CsvLine{
Id: line[0],
Array1: line[1], // note: this does not compile; line[1] is a string, not []string
Array2: line[2],
}
fmt.Println(data.Id)
fmt.Println(data.Array1)
fmt.Println(data.Array2)
}
}
And the following setup in my csv file:
594385903dss,"['fhjdsk', 'dfjdskl', 'fkdsjgooiertio']","['jflkdsjfl', 'fkjdlsfjdslkfjldks']"
87764385903dss,"['cxxc', 'wqeewr', 'opi', 'iy', 'qw']","['cvbvc', 'gf', 'mnb', 'ewr']"
My understanding is that variable-length lists should be parsed into a slice. Is it possible to do this directly via a CSV reader? (The CSV output was generated by a Python project.)
Help/suggestions appreciated.
CSV does not have a notion of "variable-length arrays"; it is just a comma-separated list of values. The format is described in RFC 4180, and that is exactly what the encoding/csv package implements.
You can only get a string slice out of a CSV line. How you interpret the values is up to you. You have to post process your data if you want to split it further.
What you have may be simply processed with the regexp package, e.g.
var r = regexp.MustCompile(`'[^']*'`)
func split(s string) []string {
parts := r.FindAllString(s, -1)
for i, part := range parts {
parts[i] = part[1 : len(part)-1]
}
return parts
}
Testing it:
s := `['one', 'two', 'three']`
fmt.Printf("%q\n", split(s))
s = `[]`
fmt.Printf("%q\n", split(s))
s = `['o,ne', 't,w,o', 't,,hree']`
fmt.Printf("%q\n", split(s))
Output (try it on the Go Playground):
["one" "two" "three"]
[]
["o,ne" "t,w,o" "t,,hree"]
Using this split() function, this is how the processing may look:
for _, line := range lines {
data := CsvLine{
Id: line[0],
Array1: split(line[1]),
Array2: split(line[2]),
}
fmt.Printf("%+v\n", data)
}
This outputs (try it on the Go Playground):
{Id:594385903dss Array1:[fhjdsk dfjdskl fkdsjgooiertio] Array2:[jflkdsjfl fkjdlsfjdslkfjldks]}
{Id:87764385903dss Array1:[cxxc wqeewr opi iy qw] Array2:[cvbvc gf mnb ewr]}
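For what it's worth, an alternative I'd suggest (not part of the answer above): if you control the Python producer, emit the lists as real JSON (double-quoted, e.g. ["a", "b"]); then each field can be decoded directly with encoding/json and the regexp goes away:

var arr []string
if err := json.Unmarshal([]byte(line[1]), &arr); err != nil {
	// the field did not contain valid JSON
}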
I'm new to Go and have been using Split to my advantage. Recently I came across a problem: I wanted to split something and keep the splitting character in the second slice, rather than removing it or leaving it in the first slice as SplitAfter does.
For example, the following code:
strings.Split("email@email.com", "@")
returned: ["email", "email.com"]
strings.SplitAfter("email@email.com", "@")
returned: ["email@", "email.com"]
What's the best way to get ["email", "@email.com"]?
Use strings.Index to find the @ and slice to get the two parts:
var part1, part2 string
if i := strings.Index(s, "@"); i >= 0 {
part1, part2 = s[:i], s[i:]
} else {
// handle the case with no @
}
Run it on the playground.
Could this work for you?
s := strings.Split("email@email.com", "@")
address, domain := s[0], "@"+s[1]
fmt.Println(address, domain)
// email @email.com
Then combining them back into a single string:
var buffer bytes.Buffer
buffer.WriteString(address)
buffer.WriteString(domain)
result := buffer.String()
fmt.Println(result)
// email@email.com
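A side note, assuming Go 1.10+ (my addition): strings.Builder does the same job as bytes.Buffer for assembling strings, without the extra copy in String():

var b strings.Builder
b.WriteString(address)
b.WriteString(domain)
result := b.String()
fmt.Println(result)
// email@email.com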
You can use bufio.Scanner:
package main

import (
	"bufio"
	"strings"
)

// email is a bufio.SplitFunc: it emits the text before each '@' as one token
// and keeps the '@' attached to the start of the next token.
func email(data []byte, atEOF bool) (int, []byte, error) {
	// Look for the next '@' after position 0, so a token that starts
	// with '@' is not cut off at its own leading separator.
	for i := 1; i < len(data); i++ {
		if data[i] == '@' {
			return i, data[:i], nil
		}
	}
	if atEOF && len(data) > 0 {
		return len(data), data, nil // no '@' left: emit the remainder as the final token
	}
	return 0, nil, nil // request more data
}

func main() {
	s := bufio.NewScanner(strings.NewReader("email@email.com"))
	s.Split(email)
	for s.Scan() {
		println(s.Text())
	}
}
https://golang.org/pkg/bufio#Scanner.Split
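On newer Go (1.18+), strings.Cut is a one-call alternative worth knowing about (my addition, not from the answers above): it splits around the first separator, so the @ just needs to be re-attached:

before, after, found := strings.Cut("email@email.com", "@")
if found {
	fmt.Println(before, "@"+after) // email @email.com
}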
I'm going to develop a simple TCP client and server and I want to achieve high throughput (300,000 requests/second), which is easy to reach with a C or C++ TCP client and server on server-class hardware, I mean a server with 48 cores and 64GB of memory.
On my testbed, both client and server have 10G network interface cards, and I have receive-side scaling enabled on the server and transmit-packet steering enabled on the client.
I configure the client to send 10 thousand requests per second, and I run multiple instances of go run client.go from a bash script to increase the throughput. However, this way Go creates lots of threads at the operating-system level, and a large number of threads results in high context-switching cost, so I could not approach such throughput. I suspected the number of Go instances I was running from the command line. The snippet below is the client code in this approach:
func Main(cmd_rate_int int, cmd_port string) {
//runtime.GOMAXPROCS(2) // set maximum number of processes to be used by this applications
//var rate float64 = float64(rate_int)
rate := float64(cmd_rate_int)
port = cmd_port
conn, err := net.Dial("tcp", port)
if err != nil {
fmt.Println("ERROR", err)
os.Exit(1)
}
var my_random_number float64 = nextTime(rate) * 1000000
var my_random_int int = int(my_random_number)
var int_message int64 = time.Now().UnixNano()
byte_message := make([]byte, 8)
go func(conn net.Conn) {
buf := make([]byte, 8)
for {
_, err = io.ReadFull(conn, buf)
now := time.Now().UnixNano()
if err != nil {
return
}
last := int64(binary.LittleEndian.Uint64(buf))
fmt.Println((now - last) / 1000)
}
return
}(conn)
for {
my_random_number = nextTime(rate) * 1000000
my_random_int = int(my_random_number)
time.Sleep(time.Microsecond * time.Duration(my_random_int))
int_message = time.Now().UnixNano()
binary.LittleEndian.PutUint64(byte_message, uint64(int_message))
conn.Write(byte_message)
}
}
So I tried to run all my goroutines by calling go client.Main() in main, so that I do not run multiple instances from the Linux command line. I thought that might be a better idea, and it basically is: the number of OS threads no longer climbs toward 700 or so. But the throughput is still low, and it seems the full capability of the underlying hardware is not being employed. Here is the code I ran in the second approach:
func main() {
//runtime.GOMAXPROCS(2) // set maximum number of processes to be used by this applications
args := os.Args[1:]
rate_int, _ := strconv.Atoi(args[0])
client_size, _ := strconv.Atoi(args[1])
port := args[2]
i := 0
for i < client_size { // was i <= client_size, which started one client too many
go client.Main(rate_int, port)
i = i + 1
}
select {} // block forever; the original for true {} busy-loop burns a CPU core
}
I was wondering what the best practice is for reaching high throughput. I have always heard that Go is lightweight and performant, and pretty comparable with C/C++ pthreads. However, in terms of performance, I think C/C++ is still far, far better than Go. I might be doing something really wrong here, so I would be happy if anybody could help me achieve high throughput with Go.
This is a quick rework of the OP's code.
As the original source code works, it does not provide a solution as such; rather, it illustrates token-bucket usage and a few other small Go tips.
It reuses defaults similar to those in the OP's source code.
It demonstrates that you do not need two files/programs to provide both client and server.
It demonstrates usage of the flag package.
It shows how to parse a unix-nano timestamp appropriately, using time.Unix(x, y).
It shows how to take advantage of io.Copy to write-what-you-read on the same net.Conn, rather than writing manually.
Still, this is improper for production delivery.
package main
import (
"encoding/binary"
"flag"
"fmt"
"io"
"log"
"math"
"math/rand"
"net"
"os"
"sync/atomic"
"time"
"github.com/juju/ratelimit"
)
var total_rcv int64
func main() {
var cmd_rate_int float64
var cmd_port string
var client_size int
flag.Float64Var(&cmd_rate_int, "rate", 400000, "change rate of message reading")
flag.StringVar(&cmd_port, "port", ":9090", "port to listen")
flag.IntVar(&client_size, "size", 20, "number of clients")
flag.Parse()
t := flag.Arg(0)
if t == "server" {
server(cmd_port)
} else if t == "client" {
for i := 0; i < client_size; i++ {
go client(cmd_rate_int, cmd_port)
}
// <-make(chan bool) // infinite wait.
<-time.After(time.Second * 2)
fmt.Println("total exchanged", total_rcv)
} else if t == "client_ratelimit" {
bucket := ratelimit.NewBucketWithQuantum(time.Second, int64(cmd_rate_int), int64(cmd_rate_int))
for i := 0; i < client_size; i++ {
go clientRateLimited(bucket, cmd_port)
}
// <-make(chan bool) // infinite wait.
<-time.After(time.Second * 3)
fmt.Println("total exchanged", total_rcv)
}
}
func server(cmd_port string) {
ln, err := net.Listen("tcp", cmd_port)
if err != nil {
panic(err)
}
for {
conn, err := ln.Accept()
if err != nil {
panic(err)
}
go io.Copy(conn, conn)
}
}
func client(cmd_rate_int float64, cmd_port string) {
conn, err := net.Dial("tcp", cmd_port)
if err != nil {
log.Println("ERROR", err)
os.Exit(1)
}
defer conn.Close()
go func(conn net.Conn) {
buf := make([]byte, 8)
for {
_, err := io.ReadFull(conn, buf)
if err != nil {
break
}
// int_message := int64(binary.LittleEndian.Uint64(buf))
// t2 := time.Unix(0, int_message)
// fmt.Println("ROUDNTRIP", time.Now().Sub(t2))
atomic.AddInt64(&total_rcv, 1)
}
return
}(conn)
byte_message := make([]byte, 8)
for {
wait := time.Microsecond * time.Duration(nextTime(cmd_rate_int))
if wait > 0 {
time.Sleep(wait)
fmt.Println("WAIT", wait)
}
int_message := time.Now().UnixNano()
binary.LittleEndian.PutUint64(byte_message, uint64(int_message))
_, err := conn.Write(byte_message)
if err != nil {
log.Println("ERROR", err)
return
}
}
}
func clientRateLimited(bucket *ratelimit.Bucket, cmd_port string) {
conn, err := net.Dial("tcp", cmd_port)
if err != nil {
log.Println("ERROR", err)
os.Exit(1)
}
defer conn.Close()
go func(conn net.Conn) {
buf := make([]byte, 8)
for {
_, err := io.ReadFull(conn, buf)
if err != nil {
break
}
// int_message := int64(binary.LittleEndian.Uint64(buf))
// t2 := time.Unix(0, int_message)
// fmt.Println("ROUDNTRIP", time.Now().Sub(t2))
atomic.AddInt64(&total_rcv, 1)
}
return
}(conn)
byte_message := make([]byte, 8)
for {
bucket.Wait(1)
int_message := time.Now().UnixNano()
binary.LittleEndian.PutUint64(byte_message, uint64(int_message))
_, err := conn.Write(byte_message)
if err != nil {
log.Println("ERROR", err)
return
}
}
}
func nextTime(rate float64) float64 {
return -1 * math.Log(1.0-rand.Float64()) / rate
}
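A usage note on the rework above (my reading of the flag setup; the numbers are illustrative): with the standard flag package, flags must precede the positional mode argument, e.g.

go run main.go server
go run main.go -rate 400000 -size 20 client
go run main.go -rate 400000 -size 20 client_ratelimit

The server echoes every 8-byte message back via io.Copy, and the client prints the total number of round trips observed during the timed window.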
Edit: This is a pretty bad answer. Check mh-cbon's comments for the reasons.
I don't fully understand how you're trying to do it, but if I want to control the rate in Go, I usually use two nested for loops:
for ;; time.Sleep(time.Second) {
go func (){
for i:=0; i<rate; i++ {
go func (){
// Do whatever
}()
}
}()
}
I'm starting a goroutine inside each loop:
on the outer loop, to ensure there is only 1 second between iterations
on the inner loop, to ensure I can start all the requests I want
Putting this on a problem like yours, it would look something like:
package main
import (
"net"
"os"
"time"
)
const (
rate = 100000
address = "localhost:8090"
)
func main() {
conn, err := net.Dial("tcp", address)
if err != nil {
os.Stderr.Write([]byte(err.Error() + "\n"))
os.Exit(1)
}
for ; err == nil; time.Sleep(time.Second) {
go func() {
for i := 0; i < rate; i++ {
go func(conn net.Conn) {
if _, err := conn.Write([]byte("01234567")); err != nil {
os.Stderr.Write([]byte("\nConnection closed: " + err.Error() + "\n"))
}
}(conn)
}
}()
}
}
To verify that this is actually sending the target request rate, you can have a test TCP listener like this:
package main
import (
"fmt"
"net"
"os"
"time"
)
const (
address = ":8090"
payloadSize = 8
)
func main() {
count := 0
b := make([]byte, payloadSize)
l, err := net.Listen("tcp", address)
if err != nil {
fmt.Fprintf(os.Stdout, "\nCan't listen to address %v: %v\n", address, err)
return
}
defer l.Close()
go func() {
for ; ; time.Sleep(time.Second) {
fmt.Fprintf(os.Stdout, "\rRate: %v/s ", count)
count = 0
}
}()
for {
conn, err := l.Accept()
if err != nil {
fmt.Fprintf(os.Stderr, "\nFailed to accept connection: %v\n", err)
}
for {
_, err := conn.Read(b)
if err != nil {
fmt.Fprintf(os.Stderr, "\nConnection closed: %v\n", err)
break
}
count = count + 1 // note: racy with the reporting goroutine; acceptable for a rough estimate
}
}
}
I found some issues when writing concurrently to the connection: an "inconsistent fdMutex" error. This is caused by exceeding 0xfffff concurrent writes, which fdMutex does not support. To mitigate it, make sure you don't go over that number of concurrent writes. In my system, that was >100k/s. This is not the 300k/s you're expecting, but my system is not prepared for that.
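If the goal is simply a steady send rate without a goroutine per request, here is a hedged alternative sketch using time.Ticker (my suggestion, with assumed names; note that tickers drop ticks when the receiver falls behind, so the effective rate can undershoot and should be measured):

interval := time.Second / time.Duration(rate) // 1s/300000 ≈ 3.3µs
t := time.NewTicker(interval)
defer t.Stop()
payload := []byte("01234567")
for range t.C {
	if _, err := conn.Write(payload); err != nil {
		return // connection closed
	}
}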
I am very, very memory careful as I have to write programs that need to cope with massive datasets.
Currently my application quickly reaches 32GB of memory, starts swapping, and then gets killed by the system.
I do not understand how this can be since all variables are collectable (in functions and quickly released) except TokensStruct and TokensCount in the Trainer struct. TokensCount is just a uint. TokensStruct is a 1,000,000 row slice of [5]uint32 and string, so that means 20 bytes + string, which we could call a maximum of 50 bytes per record. 50*1000000 = 50MB of memory required. So this script should therefore not use much more than 50MB + overhead + temporary collectable variables in the functions (maybe another 50MB max.) The maximum potential size of TokensStruct is 5,000,000, as this is the size of dictionary, but even then it would be only 250MB of memory. dictionary is a map and apparently uses around 600MB of memory, as that is how the app starts, but this is not an issue because dictionary is only loaded once and never written to again.
Instead it uses 32GB of memory then dies. By the speed that it does this I expect it would happily get to 1TB of memory if it could. The memory appears to increase in a linear fashion with the size of the files being loaded, meaning that it appears to never clear any memory at all. Everything that enters the app is allocated more memory and memory is never freed.
I tried implementing runtime.GC() in case the garbage collection wasn't running often enough, but this made no difference.
Since the memory usage increases in a linear fashion, this would imply that there is a memory leak in getTokens() or LoadZip(). I don't know how this could be, since they are both functions that do one task and then return. Or it could be that the tokens variable in Start() is the cause of the leak. Basically, it looks like every file that is loaded and parsed is never released from memory, as that is the only way the memory could fill up linearly and keep rising toward 32GB++.
Absolute nightmare! What's wrong with Go? Any way to fix this?
package main
import (
"bytes"
"code.google.com/p/go.text/transform"
"code.google.com/p/go.text/unicode/norm"
"compress/zlib"
"encoding/gob"
"fmt"
"github.com/AlasdairF/BinSearch"
"io/ioutil"
"os"
"regexp"
"runtime"
"strings"
"unicode"
"unicode/utf8"
)
type TokensStruct struct {
binsearch.Key_string
Value [][5]uint32
}
type Trainer struct {
Tokens TokensStruct
TokensCount uint
}
func checkErr(err error) {
if err == nil {
return
}
fmt.Println(`Some Error:`, err)
panic(err)
}
// Local helper function for normalization of UTF8 strings.
func isMn(r rune) bool {
return unicode.Is(unicode.Mn, r) // Mn: nonspacing marks
}
// This map is used by RemoveAccents function to convert non-accented characters.
var transliterations = map[rune]string{'Æ': "E", 'Ð': "D", 'Ł': "L", 'Ø': "OE", 'Þ': "Th", 'ß': "ss", 'æ': "e", 'ð': "d", 'ł': "l", 'ø': "oe", 'þ': "th", 'Œ': "OE", 'œ': "oe"}
// removeAccentsBytes converts accented UTF8 characters into their non-accented equivalents, from a []byte.
func removeAccentsBytesDashes(b []byte) ([]byte, error) {
mnBuf := make([]byte, len(b))
t := transform.Chain(norm.NFD, transform.RemoveFunc(isMn), norm.NFC)
n, _, err := t.Transform(mnBuf, b, true)
if err != nil {
return nil, err
}
mnBuf = mnBuf[:n]
tlBuf := bytes.NewBuffer(make([]byte, 0, len(mnBuf)*2))
for i, w := 0, 0; i < len(mnBuf); i += w {
r, width := utf8.DecodeRune(mnBuf[i:])
if r == '-' {
tlBuf.WriteByte(' ')
} else {
if d, ok := transliterations[r]; ok {
tlBuf.WriteString(d)
} else {
tlBuf.WriteRune(r)
}
}
w = width
}
return tlBuf.Bytes(), nil
}
func LoadZip(filename string) ([]byte, error) {
// Open file for reading
fi, err := os.Open(filename)
if err != nil {
return nil, err
}
defer fi.Close()
// Attach ZIP reader
fz, err := zlib.NewReader(fi)
if err != nil {
return nil, err
}
defer fz.Close()
// Pull
data, err := ioutil.ReadAll(fz)
if err != nil {
return nil, err
}
return norm.NFC.Bytes(data), nil // return normalized
}
func getTokens(pibn string) []string {
var data []byte
var err error
data, err = LoadZip(`/storedir/` + pibn + `/text.zip`)
checkErr(err)
data, err = removeAccentsBytesDashes(data)
checkErr(err)
data = bytes.ToLower(data)
data = reg2.ReplaceAll(data, []byte("$2")) // remove contractions
data = reg.ReplaceAllLiteral(data, nil)
tokens := strings.Fields(string(data))
return tokens
}
func (t *Trainer) Start() {
data, err := ioutil.ReadFile(`list.txt`)
checkErr(err)
pibns := bytes.Fields(data)
for i, pibn := range pibns {
tokens := getTokens(string(pibn))
t.addTokens(tokens)
if i%100 == 0 {
runtime.GC() // I added this just to try to stop the memory craziness, but it makes no difference
}
}
}
func (t *Trainer) addTokens(tokens []string) {
for _, tok := range tokens {
if _, ok := dictionary[tok]; ok {
if indx, ok2 := t.Tokens.Find(tok); ok2 {
ar := t.Tokens.Value[indx]
ar[0]++
t.Tokens.Value[indx] = ar
t.TokensCount++
} else {
t.Tokens.AddKeyAt(tok, indx)
t.Tokens.Value = append(t.Tokens.Value, [5]uint32{0, 0, 0, 0, 0})
copy(t.Tokens.Value[indx+1:], t.Tokens.Value[indx:])
t.Tokens.Value[indx] = [5]uint32{1, 0, 0, 0, 0}
t.TokensCount++
}
}
}
return
}
func LoadDictionary() {
dictionary = make(map[string]bool)
data, err := ioutil.ReadFile(`dictionary`)
checkErr(err)
words := bytes.Fields(data)
for _, word := range words {
strword := string(word)
dictionary[strword] = false
}
}
var reg = regexp.MustCompile(`[^a-z0-9\s]`)
var reg2 = regexp.MustCompile(`\b(c|l|all|dall|dell|nell|sull|coll|pell|gl|agl|dagl|degl|negl|sugl|un|m|t|s|v|d|qu|n|j)'([a-z])`) //contractions
var dictionary map[string]bool
func main() {
trainer := new(Trainer)
LoadDictionary()
trainer.Start()
}
Make sure that if you're tokenizing from a large string, you avoid memory pinning. From the comments above, it sounds like the tokens are substrings of one large string: strings.Fields returns slices that share the original string's backing array, so holding on to even one token keeps the entire file's contents alive.
You may need to add a little extra to your getTokens() function so it guarantees the tokens aren't pinning memory; round-tripping through []byte forces a fresh copy:
func getTokens(...) {
	// ... near the end of the function:
	for i, t := range tokens {
		tokens[i] = string([]byte(t)) // the copy breaks the reference to the whole file
	}
}
By the way, reading the whole file into memory using ioutil.ReadFile all at once looks dubious. Are you sure you can't use bufio.Scanner?
I'm looking at the code more closely... if you are truly concerned about memory, take advantage of io.Reader. You should try to avoid sucking in the content of a whole file at once. Use io.Reader and the transform "along the grain". The way you're using it now is against the grain of its intent. The whole point of the transform package you're using is to construct flexible Readers that can stream through data.
For example, here's a simplification of what you're doing:
package main
import (
"bufio"
"bytes"
"fmt"
"unicode/utf8"
"code.google.com/p/go.text/transform"
)
type AccentsTransformer map[rune]string
func (a AccentsTransformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
for nSrc < len(src) {
// If we're at the edge, note this and return.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
return
}
r, width := utf8.DecodeRune(src[nSrc:])
if r == utf8.RuneError && width == 1 {
err = fmt.Errorf("Decoding error")
return
}
if d, ok := a[r]; ok {
if nDst+len(d) > len(dst) {
err = transform.ErrShortDst
return
}
copy(dst[nDst:], d)
nSrc += width
nDst += len(d)
continue
}
if nDst+width > len(dst) {
err = transform.ErrShortDst
return
}
copy(dst[nDst:], src[nSrc:nSrc+width])
nDst += width
nSrc += width
}
return
}
func main() {
transliterations := AccentsTransformer{'Æ': "E", 'Ø': "OE"}
testString := "cØØl beÆns"
b := transform.NewReader(bytes.NewBufferString(testString), transliterations)
scanner := bufio.NewScanner(b)
scanner.Split(bufio.ScanWords)
for scanner.Scan() {
fmt.Println("token:", scanner.Text())
}
}
It becomes really easy then to chain transformers together. So, for example, if we wanted to remove all hyphens from the input stream, it's just a matter of using transform.Chain appropriately:
func main() {
transliterations := AccentsTransformer{'Æ': "E", 'Ø': "OE"}
removeHyphens := transform.RemoveFunc(func(r rune) bool {
return r == '-'
})
allTransforms := transform.Chain(transliterations, removeHyphens)
testString := "cØØl beÆns - the next generation"
b := transform.NewReader(bytes.NewBufferString(testString), allTransforms)
scanner := bufio.NewScanner(b)
scanner.Split(bufio.ScanWords)
for scanner.Scan() {
fmt.Println("token:", scanner.Text())
}
}
I have not exhaustively tested the code above, so please don't just copy-and-paste it without sufficient tests. :P I just cooked it up fast. But this kind of approach --- avoiding whole-file reading --- will scale better because it will read the file in chunks.
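Two compatibility notes on the snippets above, hedged because this answer predates them: in the current golang.org/x/text packages, transform.RemoveFunc is deprecated in favor of runes.Remove from golang.org/x/text/runes, e.g.

t := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)

and the transform.Transformer interface has since gained a Reset() method, so AccentsTransformer would need a no-op func (a AccentsTransformer) Reset() {} to compile against current versions.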
1. How large are "list.txt" and "dictionary"? If they are large, no wonder the memory usage is large.
pibns := bytes.Fields(data)
How much is len(pibns)?
2. Start the GC debug output (run GODEBUG="gctrace=1" ./yourprogram) to see whether any GC is happening.
3. Do some profiling, like this:
func lookupMem() {
	// log.Debug in the original is not a standard-library call; log.Printf is used here instead
	timestamp := fmt.Sprintf("%d", time.Now().Unix()) // time.Now.Unix() in the original did not compile
	if f, err := os.Create("mem_prof" + timestamp); err != nil {
		log.Printf("record memory profile failed: %v", err)
	} else {
		runtime.GC()
		pprof.WriteHeapProfile(f)
		f.Close()
	}
	if f, err := os.Create("heap_prof" + "." + timestamp); err != nil {
		log.Printf("heap profile failed: %v", err)
	} else {
		p := pprof.Lookup("heap")
		p.WriteTo(f, 2)
		f.Close()
	}
}
func (t *Trainer) Start() {
	.......
	if i%1000 == 0 {
		// if len(pibns) is not very large, record some meminfo
		lookupMem()
	}
	.......
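Once a profile file exists, inspect it with the pprof tool (the file name here is illustrative, following the timestamp scheme in the snippet above):

go tool pprof yourprogram mem_prof1400000000

then use top and list <funcname> at the pprof prompt to see which call sites hold the bytes.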
So I am trying to make a program in Go that takes a text file full of code, converts it into Go code, and then saves that file as a Go file or text file. I have been trying to figure out how to save the changes I made to the text file, but the only way I can see the changes is through a println statement, because I am using strings.Replace to search the string array the text file is stored in and change each occurrence of a word that needs to be changed (e.g. BEGIN -> { and END -> }). So is there another way of searching and replacing in Go I don't know about, or a way to edit a text file that I don't know about, or is this impossible?
Thanks
Here is the code I have so far.
package main
import (
"os"
"bufio"
"bytes"
"io"
"fmt"
"strings"
)
func readLines(path string) (lines []string, errr error) {
var (
file *os.File
part []byte
prefix bool
)
if file, errr = os.Open(path); errr != nil {
return
}
defer file.Close()
reader := bufio.NewReader(file)
buffer := bytes.NewBuffer(make([]byte, 0))
for {
if part, prefix, errr = reader.ReadLine(); errr != nil {
break
}
buffer.Write(part)
if !prefix {
lines = append(lines, buffer.String())
buffer.Reset()
}
}
if errr == io.EOF {
errr = nil
}
return
}
func writeLines(lines []string, path string) (errr error) {
var (
file *os.File
)
if file, errr = os.Create(path); errr != nil {
return
}
defer file.Close()
for _,item := range lines {
_, errr := file.WriteString(strings.TrimSpace(item) + "\n");
if errr != nil {
fmt.Println(errr)
break
}
}
return
}
func FixBegin(lines []string) (errr error) {
var(
a string
)
for i := 0; i < len(lines); i++ { // the original unbounded loop panicked past the end of the slice
a = lines[i]
fmt.Println(strings.Replace(a, "BEGIN", "{", -1))
}
return
}
func FixEnd(lines []string) (errr error) {
var(
a string
)
for i := 0; i < len(lines); i++ { // same fix as in FixBegin
a = lines[i]
fmt.Println(strings.Replace(a, "END", "}", -1))
}
return
}
func main() {
lines, errr := readLines("foo.txt")
if errr != nil {
fmt.Println("Error: %s\n", errr)
return
}
for _, line := range lines {
fmt.Println(line)
}
errr = FixBegin(lines)
errr = writeLines(lines, "beer2.txt")
fmt.Println(errr)
errr = FixEnd(lines)
lines, errr = readLines("beer2.txt")
if errr != nil {
fmt.Println("Error: %s\n", errr)
return
}
errr = writeLines(lines, "beer2.txt")
fmt.Println(errr)
}
jnml@fsc-r630:~/src/tmp/SO/13789882$ ls
foo.txt main.go
jnml@fsc-r630:~/src/tmp/SO/13789882$ cat main.go
package main
import (
"bytes"
"io/ioutil"
"log"
)
func main() {
src, err := ioutil.ReadFile("foo.txt")
if err != nil {
log.Fatal(err)
}
src = bytes.Replace(src, []byte("BEGIN"), []byte("{"), -1)
src = bytes.Replace(src, []byte("END"), []byte("}"), -1)
if err = ioutil.WriteFile("beer2.txt", src, 0666); err != nil {
log.Fatal(err)
}
}
jnml@fsc-r630:~/src/tmp/SO/13789882$ cat foo.txt
BEGIN
FILE F(KIND=REMOTE);
EBCDIC ARRAY E[0:11];
REPLACE E BY "HELLO WORLD!";
WRITE(F, *, E);
END.
jnml@fsc-r630:~/src/tmp/SO/13789882$ go run main.go
jnml@fsc-r630:~/src/tmp/SO/13789882$ cat beer2.txt
{
FILE F(KIND=REMOTE);
EBCDIC ARRAY E[0:11];
REPLACE E BY "HELLO WORLD!";
WRITE(F, *, E);
}.
jnml@fsc-r630:~/src/tmp/SO/13789882$
I agree with @jnml wrt using ioutil to slurp the file and to write it back. But I think the replacing shouldn't be done by multiple passes over []byte. Code and data are strings/text and should be treated as such (even if dealing with non-ASCII/UTF-8 encodings requires extra work); a one-pass replacement (of all placeholders 'at once') avoids the risk of replacing the results of previous changes (even if my regexp proposal must be improved to handle non-trivial tasks).
package main
import(
"fmt"
"io/ioutil"
"log"
"regexp"
"strings"
)
func main() {
// (1) slurp the file
data, err := ioutil.ReadFile("../tmpl/xpl.go")
if err != nil {
log.Fatal("ioutil.ReadFile: ", err)
}
s := string(data)
fmt.Printf("----\n%s----\n", s)
// => function that works for files of (known) other encodings that ascii or utf8
// (2) create a map that maps placeholder to be replaced to the replacements
x := map[string]string {
"BEGIN" : "{",
"END" : "}"}
ks := make([]string, 0, len(x))
for k := range x {
ks = append(ks, k)
}
// => function(s) that gets the keys from maps
// (3) create a regexp that finds the placeholder to be replaced
p := strings.Join(ks, "|")
fmt.Printf("/%s/\n", p)
r := regexp.MustCompile(p)
// => funny letters & order need more consideration
// (4) create a callback function for ..ReplaceAllStringFunc that knows
// about the map x
f := func(s string) string {
fmt.Printf("*** '%s'\n", s)
return x[s]
}
// => function (?) to do Step (2) .. (4) in a reusable way
// (5) do the replacing (s will be overwritten with the result)
s = r.ReplaceAllStringFunc(s, f)
fmt.Printf("----\n%s----\n", s)
// (6) write back
err = ioutil.WriteFile("result.go", []byte(s), 0644)
if err != nil {
log.Fatal("ioutil.WriteFile: ", err)
}
// => function that works for files of (known) other encodings that ascii or utf8
}
output:
go run 13789882.go
----
func main() BEGIN
END
----
/BEGIN|END/
*** 'BEGIN'
*** 'END'
----
func main() {
}
----
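A hedged refinement of step (3) above (my addition): joining raw keys breaks if a placeholder contains regexp metacharacters, and a key that is a prefix of another (say END vs ENDLOOP, hypothetical names) could match too early, since Go's alternation prefers the earlier alternative. Quoting each key and sorting longest-first handles both, assuming "sort" is also imported:

sort.Slice(ks, func(i, j int) bool { return len(ks[i]) > len(ks[j]) })
for i, k := range ks {
	ks[i] = regexp.QuoteMeta(k)
}
p := strings.Join(ks, "|")
r := regexp.MustCompile(p)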
If your file size is huge, reading everything into memory might not be possible, nor advisable. Give BytesReplacingReader a try, as it does the replacement in streaming fashion and is reasonably performant. If you want to replace two strings (such as BEGIN -> { and END -> }), just wrap two BytesReplacingReaders around the original reader, one for BEGIN and one for END:
r := NewBytesReplacingReader(
	NewBytesReplacingReader(inputReader, []byte("BEGIN"), []byte("{")),
	[]byte("END"), []byte("}"))
// use r normally and all non-overlapping occurrences of
// "BEGIN" and "END" will be replaced with "{" and "}"