// readFile opens the file named fileName and closes it again.
//
// It returns nil when the file could be opened. If os.Open fails, the error
// is printed to standard output and returned to the caller.
func readFile(fileName string) error {
	f, err := os.Open(fileName)
	if err != nil {
		fmt.Println("cannot able to read the file", err)
		return err
	}
	// Register the close only once the open is known to have succeeded.
	defer f.Close() // don't forget to close the file
	return nil
}
打开文件后有两个选择:
bufio.NewReader()
将整个文件加载到内存
buffer 与 cache 的区别:
buffer(缓冲)是为了提高内存和硬盘(或其他 I/O 设备)之间的数据交换的速度而设计的。
cache(缓存)是为了提高CPU和内存之间的数据交换速度而设计的,也就是平常见到的一级缓存、二级缓存、三级缓存。
// readFile reads the file named fileName from start to end in 4 KiB chunks
// through a bufio.Reader.
//
// It returns nil once the end of the file is reached. If the file cannot be
// opened, the error is printed to standard output and returned; any read
// error other than io.EOF is returned as-is.
func readFile(fileName string) error {
	f, err := os.Open(fileName)
	if err != nil {
		fmt.Println("cannot able to read the file", err)
		return err
	}
	// Register the close only once the open is known to have succeeded.
	defer f.Close() // don't forget to close the file

	r := bufio.NewReader(f)
	buf := make([]byte, 4*1024) // size of each chunk; allocated once and reused
	for {
		n, err := r.Read(buf) // load the next chunk into the buffer
		if n > 0 {
			// Handle the bytes that were read before looking at err.
			chunk := buf[:n]
			_ = chunk // placeholder: per-chunk processing goes here
		}
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}
		// A (0, nil) read means nothing happened; just read again.
	}
}
将上面的代码进行优化,使用goroutine对多个块同时处理。
完整代码在这里
package main
import (
"bufio"
"fmt"
"io"
"math"
"os"
"strings"
"sync"
"time"
)
// main is the entry point of the log extractor.
//
// It expects exactly six command-line arguments and takes the start time,
// finish time and log file name from positions 1, 3 and 5. It parses the
// timestamp of the last log line in the file and calls Process only when that
// timestamp lies strictly between the two query times. Every failure is
// reported on standard output and ends the run early.
func main() {
	s := time.Now()
	args := os.Args[1:]
	if len(args) != 6 { // expected form: LogExtractor.exe -f "From Time" -t "To Time" -i "Log file directory location"
		fmt.Println("Please give proper command line arguments")
		return
	}
	// Values are taken by position; the flag names themselves are not inspected.
	startTimeArg := args[1]
	finishTimeArg := args[3]
	fileName := args[5]

	file, err := os.Open(fileName)
	if err != nil {
		fmt.Println("cannot able to read the file", err)
		return
	}
	defer file.Close() // close after checking err

	queryStartTime, err := time.Parse("2006-01-02T15:04:05.0000Z", startTimeArg)
	if err != nil {
		fmt.Println("Could not able to parse the start time", startTimeArg)
		return
	}
	queryFinishTime, err := time.Parse("2006-01-02T15:04:05.0000Z", finishTimeArg)
	if err != nil {
		fmt.Println("Could not able to parse the finish time", finishTimeArg)
		return
	}

	fileStat, err := file.Stat()
	if err != nil {
		fmt.Println("Could not able to get the file stat")
		return
	}
	fileSize := fileStat.Size()

	b := make([]byte, 1)

	// Step back over trailing newlines so that a file ending in "\n" still
	// yields its final log line instead of an empty one.
	end := fileSize
	for end > 0 {
		if _, err := file.ReadAt(b, end-1); err != nil {
			fmt.Println("Error reading file ", err)
			return
		}
		if b[0] != '\n' {
			break
		}
		end--
	}

	// Walk backwards to the newline before the last line. The loop stops at
	// offset -1 when the last line is also the first line of the file.
	offset := end - 1
	lastLineSize := 0
	for offset >= 0 {
		if _, err := file.ReadAt(b, offset); err != nil {
			fmt.Println("Error reading file ", err)
			return
		}
		if b[0] == '\n' {
			break
		}
		offset--
		lastLineSize++
	}
	if lastLineSize == 0 {
		fmt.Println("Log file contains no log lines")
		return
	}

	lastLine := make([]byte, lastLineSize)
	_, err = file.ReadAt(lastLine, offset+1)
	if err != nil {
		fmt.Println("Could not able to read last line with offset", offset, "and last line size", lastLineSize)
		return
	}

	// The timestamp is the text before the first comma of the line.
	logSlice := strings.SplitN(string(lastLine), ",", 2)
	logCreationTimeString := logSlice[0]
	lastLogCreationTime, err := time.Parse("2006-01-02T15:04:05.0000Z", logCreationTimeString)
	if err != nil {
		fmt.Println("can not able to parse time : ", err)
		return
	}

	// NOTE(review): the file is skipped when its last entry is at or after the
	// finish time, even if earlier entries fall inside the window - confirm
	// that this is the intended filter.
	if lastLogCreationTime.After(queryStartTime) && lastLogCreationTime.Before(queryFinishTime) {
		if err := Process(file, queryStartTime, queryFinishTime); err != nil {
			fmt.Println("Error processing file ", err)
			return
		}
	}
	fmt.Println("\nTime taken - ", time.Since(s))
}
// Process reads f from its current position to the end in chunks and hands
// each chunk to ProcessChunk on its own goroutine, together with the start and
// end times.
//
// Each chunk is extended up to the next newline so that it ends on a line
// boundary. Process waits for every started goroutine before returning. It
// returns nil when the end of the file is reached, or the wrapped read error
// otherwise; reaching io.EOF is not reported as an error.
func Process(f *os.File, start time.Time, end time.Time) error {
	linesPool := sync.Pool{New: func() interface{} {
		lines := make([]byte, 250*1024)
		return lines
	}}
	stringPool := sync.Pool{New: func() interface{} {
		lines := ""
		return lines
	}}
	r := bufio.NewReader(f)
	var wg sync.WaitGroup
	var readErr error
	// NOTE(review): one goroutine is started per chunk with no upper bound.
	for readErr == nil {
		buf := linesPool.Get().([]byte)
		// A recycled buffer comes back shortened to its previous contents;
		// restore the full length so every read can fill a whole chunk.
		buf = buf[:cap(buf)]
		n, err := r.Read(buf)
		buf = buf[:n]
		if n == 0 {
			linesPool.Put(buf)
			if err == io.EOF {
				break
			}
			// A non-nil error ends the loop through its condition; a (0, nil)
			// read simply retries.
			readErr = err
			continue
		}
		// Extend the chunk to the end of the current line. ReadBytes returns
		// the bytes it read even when it stops on an error such as io.EOF, so
		// they are always appended; otherwise a final line without a trailing
		// newline would be lost.
		nextUntilNewline, err := r.ReadBytes('\n')
		buf = append(buf, nextUntilNewline...)
		if err != nil && err != io.EOF {
			readErr = err
		}
		wg.Add(1)
		go func(chunk []byte) {
			defer wg.Done()
			ProcessChunk(chunk, &linesPool, &stringPool, start, end)
		}(buf)
	}
	// Always wait, so no goroutine is still running when Process returns.
	wg.Wait()
	if readErr != nil {
		return fmt.Errorf("reading log file: %w", readErr)
	}
	return nil
}
// ProcessChunk splits chunk into lines and checks the timestamp of every
// non-empty line against the open interval (start, end).
//
// The chunk is copied into a string and then returned to linesPool, so the
// caller must not use it afterwards. Lines are handled in batches of 300, one
// goroutine per batch, and ProcessChunk returns once all batches are done.
// The timestamp is the text before the first comma of a line. A line whose
// timestamp cannot be parsed is reported on standard output and skipped; the
// rest of its batch is still processed.
//
// stringPool is not used; the parameter is kept so existing callers continue
// to compile.
func ProcessChunk(chunk []byte, linesPool *sync.Pool, stringPool *sync.Pool, start time.Time, end time.Time) {
	var wg2 sync.WaitGroup
	// string(chunk) copies the bytes, which makes it safe to recycle the
	// buffer straight away.
	logs := string(chunk)
	linesPool.Put(chunk)
	logsSlice := strings.Split(logs, "\n")
	chunkSize := 300
	n := len(logsSlice)
	noOfThread := n / chunkSize
	if n%chunkSize != 0 {
		noOfThread++
	}
	for i := 0; i < noOfThread; i++ {
		wg2.Add(1)
		go func(s int, e int) {
			defer wg2.Done() // to avoid deadlocks
			for _, text := range logsSlice[s:e] {
				if len(text) == 0 {
					continue
				}
				logSlice := strings.SplitN(text, ",", 2)
				logCreationTimeString := logSlice[0]
				logCreationTime, err := time.Parse("2006-01-02T15:04:05.0000Z", logCreationTimeString)
				if err != nil {
					fmt.Printf("\n Could not able to parse the time :%s for log : %v", logCreationTimeString, text)
					// Skip only this line; the remaining lines of the batch
					// must still be examined.
					continue
				}
				if logCreationTime.After(start) && logCreationTime.Before(end) {
					// fmt.Println(text)
				}
			}
		}(i*chunkSize, int(math.Min(float64((i+1)*chunkSize), float64(len(logsSlice))))) // clamp the last batch to the slice length
	}
	wg2.Wait()
}
// main prints the size of the file "water" by reading it to the end through a
// fixed-size buffer and summing the number of bytes returned by each read.
//
// An error from opening or reading the file is printed instead of the size.
func main() {
	file, err := os.Open("water")
	if err != nil {
		fmt.Println(err)
		return
	}
	defer file.Close()

	sum := 0
	buf := make([]byte, 2014)
	for {
		n, err := file.Read(buf)
		sum += n
		if err == io.EOF {
			break
		}
		if err != nil {
			// Any other error would repeat on every read; stop instead of
			// looping forever.
			fmt.Println(err)
			return
		}
	}
	fmt.Println("file size is ", sum)
}
通过for循环读取文件的字节内容,然后算出文件的大小,效率低,代码量大。
// main loads the whole file "water" into memory with ioutil.ReadFile and
// prints the length of the returned contents. If the file cannot be read,
// nothing is printed.
func main() {
	data, readErr := ioutil.ReadFile("water")
	if readErr != nil {
		return
	}
	fmt.Println("file size is ", len(data))
}
使用ioutil包的ReadFile来代替,直接获得文件的内容,进而计算出文件的大小。
看看ReadFile的具体实现:
// ReadFile reads the file named by filename and returns the contents.
// A successful call returns err == nil, not err == EOF. Because ReadFile
// reads the whole file, it does not treat an EOF from Read as an error
// to be reported.
//
// If the file cannot be opened, ReadFile returns a nil slice and the error
// from os.Open; otherwise it returns whatever readAll returns.
func ReadFile(filename string) ([]byte, error) {
f, err := os.Open(filename)
if err != nil {
return nil, err
}
defer f.Close()
// It's a good but not certain bet that FileInfo will tell us exactly how much to
// read, so let's try it but be prepared for the answer to be wrong.
// n starts at bytes.MinRead, which remains the capacity hint if Stat fails.
var n int64 = bytes.MinRead
if fi, err := f.Stat(); err == nil {
// As initial capacity for readAll, use Size + a little extra in case Size
// is zero, and to avoid another allocation after Read has filled the
// buffer. The readAll call will read into its allocated internal buffer
// cheaply. If the size was wrong, we'll either waste some space off the end
// or reallocate as needed, but in the overwhelmingly common case we'll get
// it just right.
if size := fi.Size() + bytes.MinRead; size > n {
n = size
}
}
// A Stat error is deliberately not returned; only the result of readAll is.
return readAll(f, n)
}
发现里面用了file的Stat()方法。
// main prints the size of the file "water" as reported by Stat on the opened
// file, without reading its contents.
//
// An error from opening the file or from Stat is printed instead of the size.
func main() {
	file, err := os.Open("water")
	if err != nil {
		fmt.Println(err)
		return
	}
	defer file.Close()

	// fi must not be used when Stat fails, so the error is checked first.
	fi, err := file.Stat()
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println("file size is ", fi.Size())
}
看一下file的Stat的实现:
// Stat returns the FileInfo structure describing file.
// If there is an error, it will be of type *PathError.
func (f *File) Stat() (FileInfo, error) {
// A nil receiver is rejected with ErrInvalid before any field is accessed.
if f == nil {
return nil, ErrInvalid
}
var fs fileStat
err := f.pfd.Fstat(&fs.sys)
if err != nil {
// Wrap the failure with the operation name "stat" and the file's name.
return nil, &PathError{"stat", f.name, err}
}
// Fill in the remaining fields of fs from fs.sys and the file name.
fillFileStatFromSys(&fs, f.name)
return &fs, nil
}
// main prints the size of the file "water" using os.Stat, which needs only
// the file name and does not open the file.
//
// If os.Stat fails, the error is printed instead of the size.
func main() {
	fi, err := os.Stat("water")
	if err != nil {
		fmt.Println(err)
		return
	}
	// err is always nil here, so it is no longer appended to the output.
	fmt.Println("file size is ", fi.Size())
}
看一下os的Stat的实现:
// Stat returns a FileInfo describing the named file.
// If there is an error, it will be of type *PathError.
func Stat(name string) (FileInfo, error) {
// Pass the name to testlog.Stat first, then delegate the lookup to statNolog.
// NOTE(review): testlog presumably records the access for the test tooling - confirm.
testlog.Stat(name)
return statNolog(name)
}
// Get file information.
//
// main looks up "water" with os.Stat and prints its name, size, directory
// flag, mode and modification time, one per line. Nothing is printed when
// os.Stat fails.
func main() {
	info, statErr := os.Stat("water")
	if statErr != nil {
		return
	}
	// Each entry pairs an output label with the value printed after it.
	fields := []struct {
		label string
		value interface{}
	}{
		{"name:", info.Name()},
		{"size:", info.Size()},
		{"is dir:", info.IsDir()},
		{"mode::", info.Mode()},
		{"modTime:", info.ModTime()},
	}
	for _, field := range fields {
		fmt.Println(field.label, field.value)
	}
}
// Check whether a file exists.
//
// main calls os.Stat on "." and prints "file exist" when it succeeds,
// "file not exist" when os.IsNotExist recognises the error, and the error
// itself in every other case.
func main() {
	_, statErr := os.Stat(".")
	switch {
	case statErr == nil:
		fmt.Println("file exist")
	case os.IsNotExist(statErr):
		fmt.Println("file not exist")
	default:
		fmt.Println(statErr)
	}
}