func readFile(fileName string) error { f, err := os.Open(fileName) defer file.Close() // 不要忘记关闭文件 if err != nil { fmt.Println("cannot able to read the file", err) return } return nil }
打开文件后有两个选择:
bufio.NewReader() 并不会将整个文件一次性加载到内存,而是为已打开的文件包装一个带缓冲区的读取器,之后可以按块读取文件内容。
buffer与cache的区别:
buffer(缓冲)是为了提高内存和硬盘(或其他I/O设备)之间的数据交换的速度而设计的。
cache(缓存)是为了提高CPU和内存之间的数据交换速度而设计,也就是平常见到的一级缓存、二级缓存、三级缓存。
func readFile(fileName string) error { f, err := os.Open(fileName) defer f.Close() // 不要忘记关闭文件 if err != nil { fmt.Println("cannot able to read the file", err) return err } r := bufio.NewReader(f) for { buf := make([]byte, 4*1024) // 每个块的大小 n, err := r.Read(buf) // 加载块到缓冲(buffer)中 buf = buf[:n] if n == 0 { if err == nil { fmt.Println(err) break } if err == io.EOF { break } return err } } return nil }
将上面的代码进行优化,使用goroutine对多个块同时处理。
完整代码在这里
// Command LogExtractor scans a log file whose lines start with a timestamp
// followed by a comma, looking for entries inside a queried time window.
package main

import (
	"bufio"
	"fmt"
	"io"
	"os"
	"strings"
	"sync"
	"time"
)

// timeLayout is the timestamp format of the -f/-t arguments and of the first
// comma-separated field of every log line.
const timeLayout = "2006-01-02T15:04:05.0000Z"

const (
	readBufferSize = 250 * 1024 // bytes requested from the file per chunk
	linesPerWorker = 300        // log lines handled by one goroutine in ProcessChunk
)

// main parses the command line, checks that the last log entry lies inside
// the queried window and, if so, scans the whole file with Process. It always
// prints the elapsed time once the arguments and the file have been accepted.
func main() {
	s := time.Now()
	args := os.Args[1:]
	if len(args) != 6 { // for format LogExtractor.exe -f "From Time" -t "To Time" -i "Log file directory location"
		fmt.Println("Please give proper command line arguments")
		return
	}
	startTimeArg := args[1]
	finishTimeArg := args[3]
	fileName := args[5]

	file, err := os.Open(fileName)
	if err != nil {
		fmt.Println("cannot able to read the file", err)
		return
	}
	defer file.Close() // close after checking err

	queryStartTime, err := time.Parse(timeLayout, startTimeArg)
	if err != nil {
		fmt.Println("Could not able to parse the start time", startTimeArg)
		return
	}
	queryFinishTime, err := time.Parse(timeLayout, finishTimeArg)
	if err != nil {
		fmt.Println("Could not able to parse the finish time", finishTimeArg)
		return
	}

	fileStat, err := file.Stat()
	if err != nil {
		fmt.Println("Could not able to get the file stat")
		return
	}

	lastLine, err := readLastLine(file, fileStat.Size())
	if err != nil {
		fmt.Println("Could not able to read last line", err)
		return
	}

	logCreationTimeString := strings.SplitN(lastLine, ",", 2)[0]
	lastLogCreationTime, err := time.Parse(timeLayout, logCreationTimeString)
	if err != nil {
		fmt.Println("can not able to parse time : ", err)
	} else if lastLogCreationTime.After(queryStartTime) && lastLogCreationTime.Before(queryFinishTime) {
		// The file is scanned only when its last entry falls strictly inside
		// the query window.
		if err := Process(file, queryStartTime, queryFinishTime); err != nil {
			fmt.Println("Error reading file ", err)
		}
	}
	fmt.Println("\nTime taken - ", time.Since(s))
}

// readLastLine returns the last non-empty line of f, without its line
// terminator. size is the length of the file in bytes. Trailing '\n' and '\r'
// bytes are skipped, so a file that ends with a newline still yields its
// final record. An empty file yields an empty string.
func readLastLine(f *os.File, size int64) (string, error) {
	b := make([]byte, 1)

	// Step back over the line terminators at the very end of the file.
	end := size
	for end > 0 {
		if _, err := f.ReadAt(b, end-1); err != nil {
			return "", err
		}
		if b[0] != '\n' && b[0] != '\r' {
			break
		}
		end--
	}

	// Walk back to the byte after the previous newline, stopping at offset 0
	// so a single-line file never produces a negative offset.
	start := end
	for start > 0 {
		if _, err := f.ReadAt(b, start-1); err != nil {
			return "", err
		}
		if b[0] == '\n' {
			break
		}
		start--
	}

	line := make([]byte, end-start)
	if _, err := f.ReadAt(line, start); err != nil && err != io.EOF {
		return "", err
	}
	return string(line), nil
}

// Process reads f from its current offset in chunks of roughly readBufferSize
// bytes, extends every chunk to the next newline so that no log line is split,
// and hands each chunk to ProcessChunk in its own goroutine.
//
// start and end bound the time window passed on to ProcessChunk. Process waits
// for all chunk goroutines before returning. It returns nil when the end of
// the file is reached and the first non-EOF read error otherwise.
func Process(f *os.File, start time.Time, end time.Time) error {
	linesPool := sync.Pool{New: func() interface{} {
		return make([]byte, readBufferSize)
	}}

	r := bufio.NewReader(f)
	var wg sync.WaitGroup
	var readErr error
	for {
		buf := linesPool.Get().([]byte)
		buf = buf[:cap(buf)] // recycled buffers are always used at full capacity

		n, err := r.Read(buf)
		buf = buf[:n]
		if n > 0 && err == nil {
			// Complete the line cut off by the fixed-size read. ReadBytes
			// returns whatever it read even on error, which at EOF is the
			// unterminated last line of the file, so it is always appended.
			var rest []byte
			rest, err = r.ReadBytes('\n')
			buf = append(buf, rest...)
		}

		if len(buf) > 0 {
			wg.Add(1)
			go func(chunk []byte) {
				defer wg.Done()
				ProcessChunk(chunk, &linesPool, nil, start, end)
			}(buf)
		} else {
			linesPool.Put(buf[:cap(buf)])
		}

		if err == io.EOF {
			break
		}
		if err != nil {
			readErr = err
			break
		}
	}
	wg.Wait()
	return readErr
}

// ProcessChunk splits chunk into lines, parses the timestamp in front of the
// first comma of every non-empty line and tests it against the open interval
// (start, end). Lines are handled in batches of linesPerWorker, one goroutine
// per batch; ProcessChunk returns when all batches are done.
//
// chunk is handed back to linesPool once its content has been copied, so the
// caller must not use it afterwards. stringPool is not used and may be nil; it
// is kept so that the signature stays unchanged.
func ProcessChunk(chunk []byte, linesPool *sync.Pool, stringPool *sync.Pool, start time.Time, end time.Time) {
	// string(chunk) copies the bytes, so the buffer can be recycled right
	// away. It goes back at full capacity, otherwise later reads would shrink.
	logs := string(chunk)
	linesPool.Put(chunk[:cap(chunk)])

	logsSlice := strings.Split(logs, "\n")

	var wg sync.WaitGroup
	for lo := 0; lo < len(logsSlice); lo += linesPerWorker {
		hi := lo + linesPerWorker
		if hi > len(logsSlice) {
			hi = len(logsSlice)
		}
		wg.Add(1)
		go func(batch []string) {
			defer wg.Done() // to avoid deadlocks
			for _, text := range batch {
				if len(text) == 0 {
					continue
				}
				logCreationTimeString := strings.SplitN(text, ",", 2)[0]
				logCreationTime, err := time.Parse(timeLayout, logCreationTimeString)
				if err != nil {
					fmt.Printf("\n Could not able to parse the time :%s for log : %v", logCreationTimeString, text)
					// One malformed line must not hide the rest of the batch.
					continue
				}
				if logCreationTime.After(start) && logCreationTime.Before(end) {
					// Matching line; printing is left disabled as in the original.
					// fmt.Println(text)
				}
			}
		}(logsSlice[lo:hi])
	}
	wg.Wait()
}
func main() { file,err:=os.Open("water") if err ==nil { sum := 0 buf:=make([]byte,2014) for { n,err:=file.Read(buf) sum+=n if err==io.EOF { break } } fmt.Println("file size is ",sum) } }
通过for循环读取文件的字节内容,然后算出文件的大小,效率低,代码量大。
func main() { content,err:=ioutil.ReadFile("water") if err == nil { fmt.Println("file size is ",len(content)) } }
使用ioutil包的ReadFile来代替,直接获得文件的内容,进而计算出文件的大小。
看看ReadFile的具体实现:
// ReadFile reads the file named by filename and returns the contents.
// A successful call returns err == nil, not err == EOF. Because ReadFile
// reads the whole file, it does not treat an EOF from Read as an error
// to be reported.
func ReadFile(filename string) ([]byte, error) {
	f, err := os.Open(filename)
	if err != nil {
		return nil, err
	}
	defer f.Close()
	// It's a good but not certain bet that FileInfo will tell us exactly how much to
	// read, so let's try it but be prepared for the answer to be wrong.
	var n int64 = bytes.MinRead

	// A failed Stat is not an error here: n simply keeps its bytes.MinRead default.
	if fi, err := f.Stat(); err == nil {
		// As initial capacity for readAll, use Size + a little extra in case Size
		// is zero, and to avoid another allocation after Read has filled the
		// buffer. The readAll call will read into its allocated internal buffer
		// cheaply. If the size was wrong, we'll either waste some space off the end
		// or reallocate as needed, but in the overwhelmingly common case we'll get
		// it just right.
		if size := fi.Size() + bytes.MinRead; size > n {
			n = size
		}
	}
	// readAll is defined outside this excerpt; it receives the open file and
	// the capacity hint n computed above.
	return readAll(f, n)
}
发现里面用了file的Stat()方法。
func main() { file,err:=os.Open("water") if err == nil { fi,_:=file.Stat() fmt.Println("file size is ",fi.Size()) } }
看一下file的Stat的实现:
// Stat returns the FileInfo structure describing file.
// If there is an error, it will be of type *PathError.
func (f *File) Stat() (FileInfo, error) {
	// A nil receiver is rejected up front; note that this path returns
	// ErrInvalid directly rather than a *PathError.
	if f == nil {
		return nil, ErrInvalid
	}
	var fs fileStat
	// Fstat is called on the file's pfd field and writes into fs.sys; both
	// pfd and fileStat are declared outside this excerpt.
	err := f.pfd.Fstat(&fs.sys)
	if err != nil {
		return nil, &PathError{"stat", f.name, err}
	}
	// fillFileStatFromSys is defined elsewhere; presumably it fills the
	// remaining fileStat fields from fs.sys and the file name (not shown here).
	fillFileStatFromSys(&fs, f.name)
	return &fs, nil
}
func main() { fi,err:=os.Stat("water") if err ==nil { fmt.Println("file size is ",fi.Size(),err) } }
看一下os的Stat的实现:
// Stat returns a FileInfo describing the named file.
// If there is an error, it will be of type *PathError.
func Stat(name string) (FileInfo, error) {
	// testlog.Stat is called with the name before the lookup; what it does
	// with it is not visible in this excerpt.
	testlog.Stat(name)
	// The lookup itself is delegated to statNolog, defined elsewhere.
	return statNolog(name)
}
// 获取文件信息 func main() { fi,err:=os.Stat("water") if err ==nil { fmt.Println("name:",fi.Name()) fmt.Println("size:",fi.Size()) fmt.Println("is dir:",fi.IsDir()) fmt.Println("mode::",fi.Mode()) fmt.Println("modTime:",fi.ModTime()) } } // 判断文件是否存在 func main() { _,err:=os.Stat(".") if err ==nil { fmt.Println("file exist") }else if os.IsNotExist(err){ fmt.Println("file not exist") }else{ fmt.Println(err) } }