20 读取文件

0.1. 打开文件的方式

// readFile opens fileName for reading and closes it again before returning.
//
// It returns the error from os.Open when the file cannot be opened (after
// printing it), and nil otherwise.
func readFile(fileName string) error {
	f, err := os.Open(fileName)
	if err != nil {
		fmt.Println("cannot able to read the file", err)
		return err
	}
	// Register the close only after a successful open, so it always acts on a
	// valid handle. Do not forget to close the file.
	defer f.Close()

	return nil
}

打开文件后有三种选择:

  1. 逐行读取文件:减少内存负担,IO时间花销大
  2. 立即将整个文件读取到内存中:消耗更多内存,IO时间小
  3. 使用Golang中的bufio.NewReader()将文件分块加载到内存:在内存占用和IO时间之间取折中

buffer与cache的区别:

  1. buffer(缓冲)是为了提高内存和硬盘(或其他I/O设备)之间的数据交换的速度而设计的。

  2. cache(缓存)是为了提高CPU和内存之间的数据交换速度而设计,也就是平常见到的一级缓存、二级缓存、三级缓存。

// readFile opens fileName and reads it to the end in 4 KiB chunks through a
// bufio.Reader.
//
// It returns nil once the end of the file is reached, the os.Open error when
// the file cannot be opened (after printing it), or the first non-EOF read
// error.
func readFile(fileName string) error {
	f, err := os.Open(fileName)
	if err != nil {
		fmt.Println("cannot able to read the file", err)
		return err
	}
	// Register the close only after a successful open. Do not forget to close
	// the file.
	defer f.Close()

	r := bufio.NewReader(f)
	// Size of each chunk. The buffer is allocated once and reused because the
	// chunks are consumed sequentially.
	buf := make([]byte, 4*1024)
	for {
		n, err := r.Read(buf) // load the next chunk into the buffer
		chunk := buf[:n]
		_ = chunk // a real consumer would process the chunk here, before looking at err

		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}
	}
}

将上面的代码进行优化,使用goroutine对多个块同时处理。

完整代码在这里

package main

import (
 "bufio"
 "fmt"
 "io"
 "math"
 "os"
 "strings"
 "sync"
 "time"
)

// main checks whether the last log line of a file falls inside a query time
// window and, when it does, hands the file to Process.
//
// Expected invocation (six arguments after the program name):
//
//	LogExtractor.exe -f "From Time" -t "To Time" -i "Log file directory location"
//
// The flag names themselves are not inspected; only the values at positions
// 1, 3 and 5 are read. Both times and the first comma-separated field of each
// log line use the layout 2006-01-02T15:04:05.0000Z.
func main() {
	const layout = "2006-01-02T15:04:05.0000Z"

	s := time.Now()
	args := os.Args[1:]
	if len(args) != 6 {
		fmt.Println("Please give proper command line arguments")
		return
	}
	startTimeArg := args[1]
	finishTimeArg := args[3]
	fileName := args[5]

	file, err := os.Open(fileName)
	if err != nil {
		fmt.Println("cannot able to read the file", err)
		return
	}
	defer file.Close() // registered only after the open succeeded

	queryStartTime, err := time.Parse(layout, startTimeArg)
	if err != nil {
		fmt.Println("Could not able to parse the start time", startTimeArg)
		return
	}

	queryFinishTime, err := time.Parse(layout, finishTimeArg)
	if err != nil {
		fmt.Println("Could not able to parse the finish time", finishTimeArg)
		return
	}

	fileStat, err := file.Stat()
	if err != nil {
		fmt.Println("Could not able to get the file stat")
		return
	}

	lastLine, err := readLastLine(file, fileStat.Size())
	if err != nil {
		fmt.Println("Error reading file ", err)
		return
	}

	logCreationTimeString := strings.SplitN(string(lastLine), ",", 2)[0]
	lastLogCreationTime, err := time.Parse(layout, logCreationTimeString)
	if err != nil {
		// Without a valid timestamp the window check below would compare
		// against the zero time, so stop here.
		fmt.Println("can not able to parse time : ", err)
		return
	}

	if lastLogCreationTime.After(queryStartTime) && lastLogCreationTime.Before(queryFinishTime) {
		if err := Process(file, queryStartTime, queryFinishTime); err != nil {
			fmt.Println("Error reading file ", err)
		}
	}

	fmt.Println("\nTime taken - ", time.Since(s))
}

// readLastLine returns the final non-empty line of f, where size is the file
// length in bytes. Trailing '\n' and '\r' bytes are skipped first, so a file
// that ends with a newline still yields its last log line instead of an empty
// one. It uses ReadAt only, so the file's read offset is left untouched.
func readLastLine(f *os.File, size int64) ([]byte, error) {
	b := make([]byte, 1)

	// Step back over the line terminator(s) at the very end of the file.
	end := size
	for end > 0 {
		if _, err := f.ReadAt(b, end-1); err != nil {
			return nil, err
		}
		if b[0] != '\n' && b[0] != '\r' {
			break
		}
		end--
	}

	// Step back to the byte after the previous newline (or to the file start).
	start := end
	for start > 0 {
		if _, err := f.ReadAt(b, start-1); err != nil {
			return nil, err
		}
		if b[0] == '\n' {
			break
		}
		start--
	}

	line := make([]byte, end-start)
	if _, err := f.ReadAt(line, start); err != nil {
		return nil, err
	}
	return line, nil
}

func Process(f *os.File, start time.Time, end time.Time) error {

 linesPool := sync.Pool{New: func() interface{} {
  lines := make([]byte, 250*1024)
  return lines
 }}

 stringPool := sync.Pool{New: func() interface{} {
  lines := ""
  return lines
 }}

 r := bufio.NewReader(f)

 var wg sync.WaitGroup

 for {
  buf := linesPool.Get().([]byte)

  n, err := r.Read(buf)
  buf = buf[:n]

  if n == 0 {
   if err != nil {
    fmt.Println(err)
    break
   }
   if err == io.EOF {
    break
   }
   return err
  }

  nextUntilNewline, err := r.ReadBytes('\n')

  if err != io.EOF {
   buf = append(buf, nextUntilNewline...)
  }

  wg.Add(1)
  go func() {
   ProcessChunk(buf, &linesPool, &stringPool, start, end)
   wg.Done()
  }()

 }

 wg.Wait()
 return nil
}

// ProcessChunk splits chunk into lines, parses the timestamp in the first
// comma-separated field of every non-empty line, and checks it against the
// open interval (start, end). Lines are handled in batches of 300, one
// goroutine per batch; ProcessChunk returns after all batches have finished.
//
// chunk is returned to linesPool as soon as its contents have been copied into
// a string. stringPool is accepted only to keep the signature stable: the
// string taken from it used to be overwritten immediately, so it is no longer
// touched.
//
// A line whose timestamp cannot be parsed is reported on standard output and
// skipped; the remaining lines of the batch are still processed.
func ProcessChunk(chunk []byte, linesPool *sync.Pool, stringPool *sync.Pool, start time.Time, end time.Time) {
	const (
		layout    = "2006-01-02T15:04:05.0000Z"
		chunkSize = 300 // lines handled by one goroutine
	)

	// string(chunk) copies the bytes, so the buffer can be recycled right away.
	// It is put back at full capacity so the next user gets the whole buffer.
	logs := string(chunk)
	linesPool.Put(chunk[:cap(chunk)])

	logsSlice := strings.Split(logs, "\n")
	n := len(logsSlice)

	var wg2 sync.WaitGroup
	for s := 0; s < n; s += chunkSize {
		// The last batch may be shorter than chunkSize.
		e := int(math.Min(float64(s+chunkSize), float64(n)))

		wg2.Add(1)
		go func(s int, e int) {
			defer wg2.Done() // to avoid deadlocks
			for _, text := range logsSlice[s:e] {
				if len(text) == 0 {
					continue
				}
				logCreationTimeString := strings.SplitN(text, ",", 2)[0]

				logCreationTime, err := time.Parse(layout, logCreationTimeString)
				if err != nil {
					fmt.Printf("\n Could not able to parse the time :%s for log : %v", logCreationTimeString, text)
					continue
				}

				if logCreationTime.After(start) && logCreationTime.Before(end) {
					// fmt.Println(text)
				}
			}
		}(s, e)
	}

	wg2.Wait()
}

0.2. 判断文件大小的方式

0.2.1. 打开文件后把内容全部读一遍并统计字节数

// main prints the size of the file "water" by reading it to the end and
// summing the number of bytes each Read returns.
//
// An open or read failure is printed and ends the program.
func main() {
	file, err := os.Open("water")
	if err != nil {
		fmt.Println(err)
		return
	}
	defer file.Close()

	sum := 0
	buf := make([]byte, 2014)
	for {
		n, err := file.Read(buf)
		sum += n
		if err == io.EOF {
			break
		}
		if err != nil {
			// Any error other than EOF would otherwise repeat on every
			// iteration and the loop would never end.
			fmt.Println(err)
			return
		}
	}
	fmt.Println("file size is ", sum)
}

通过for循环读取文件的字节内容,然后算出文件的大小,效率低,代码量大。

0.2.2. ioutil.ReadFile

// main loads the whole file "water" with ioutil.ReadFile and prints the
// length of the returned byte slice. Nothing is printed when the read fails.
func main() {
	data, err := ioutil.ReadFile("water")
	if err != nil {
		return
	}
	fmt.Println("file size is ", len(data))
}

使用ioutil包的ReadFile来代替,直接获得文件的内容,进而计算出文件的大小。

看看ReadFile的具体实现:

// ReadFile reads the file named by filename and returns the contents.
// A successful call returns err == nil, not err == EOF. Because ReadFile
// reads the whole file, it does not treat an EOF from Read as an error
// to be reported.
func ReadFile(filename string) ([]byte, error) {
 f, err := os.Open(filename)
 if err != nil {
  return nil, err
 }
 // Close the file on every return path below.
 defer f.Close()
 // It's a good but not certain bet that FileInfo will tell us exactly how much to
 // read, so let's try it but be prepared for the answer to be wrong.
 var n int64 = bytes.MinRead

 // A Stat failure is not fatal here: n simply keeps its bytes.MinRead default.
 if fi, err := f.Stat(); err == nil {
  // As initial capacity for readAll, use Size + a little extra in case Size
  // is zero, and to avoid another allocation after Read has filled the
  // buffer. The readAll call will read into its allocated internal buffer
  // cheaply. If the size was wrong, we'll either waste some space off the end
  // or reallocate as needed, but in the overwhelmingly common case we'll get
  // it just right.
  if size := fi.Size() + bytes.MinRead; size > n {
   n = size
  }
 }
 // n is only the initial capacity hint described above.
 return readAll(f, n)
}

发现里面用了file的Stat()方法。

0.2.3. File.Stat

// main opens the file "water" and prints the size reported by File.Stat.
//
// An open or Stat failure is printed and ends the program.
func main() {
	file, err := os.Open("water")
	if err != nil {
		fmt.Println(err)
		return
	}
	defer file.Close()

	// Check the Stat error: on failure fi is nil and fi.Size() would panic.
	fi, err := file.Stat()
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println("file size is ", fi.Size())
}

看一下file的Stat的实现:

// Stat returns the FileInfo structure describing file.
// If there is an error, it will be of type *PathError.
func (f *File) Stat() (FileInfo, error) {
 // A nil receiver is reported as ErrInvalid instead of being dereferenced.
 if f == nil {
  return nil, ErrInvalid
 }
 var fs fileStat
 // Fstat fills fs.sys through the file's pfd field.
 err := f.pfd.Fstat(&fs.sys)
 if err != nil {
  // Wrap the failure with the operation name and the file name.
  return nil, &PathError{"stat", f.name, err}
 }
 // NOTE(review): presumably derives the remaining fileStat fields from fs.sys
 // and the name; the helper is not shown here.
 fillFileStatFromSys(&fs, f.name)
 return &fs, nil
}

0.2.4. os.Stat

// main prints the size that os.Stat reports for "water", followed by the
// (nil) error value. Nothing is printed when os.Stat fails.
func main() {
	info, statErr := os.Stat("water")
	if statErr != nil {
		return
	}
	fmt.Println("file size is ", info.Size(), statErr)
}

看一下os的Stat的实现:

// Stat returns a FileInfo describing the named file.
// If there is an error, it will be of type *PathError.
func Stat(name string) (FileInfo, error) {
 // NOTE(review): testlog.Stat presumably records the lookup for the test log;
 // the actual work is delegated to statNolog.
 testlog.Stat(name)
 return statNolog(name)
}

os.Stat的其他用途

// Get file information.
//
// main prints the name, size, directory flag, mode and modification time that
// os.Stat reports for "water". Nothing is printed when os.Stat fails.
func main() {
	info, err := os.Stat("water")
	if err != nil {
		return
	}

	// One output line per attribute, in this order.
	rows := []struct {
		label string
		value interface{}
	}{
		{"name:", info.Name()},
		{"size:", info.Size()},
		{"is dir:", info.IsDir()},
		{"mode::", info.Mode()},
		{"modTime:", info.ModTime()},
	}
	for _, row := range rows {
		fmt.Println(row.label, row.value)
	}
}

// Check whether a file exists.
//
// main calls os.Stat on "." and prints "file exist" on success, "file not
// exist" when os.IsNotExist recognises the error, and the error itself in
// every other case.
func main() {
	switch _, err := os.Stat("."); {
	case err == nil:
		fmt.Println("file exist")
	case os.IsNotExist(err):
		fmt.Println("file not exist")
	default:
		fmt.Println(err)
	}
}
上次修改: 31 July 2020