Skip to content

Commit

Permalink
Merge pull request #19 from will-rowe/cleanup
Browse files Browse the repository at this point in the history
replacing lsh ensemble with lsh forest - removing containment search until performance can be improved
  • Loading branch information
Will Rowe authored Apr 20, 2019
2 parents f522bf4 + e1c629b commit 92ead3b
Show file tree
Hide file tree
Showing 22 changed files with 435 additions and 910 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ os:

language: go
go:
- 1.9
- tip # The latest version of Go.

install:
- go get -d -t -v ./...
Expand Down
28 changes: 9 additions & 19 deletions cmd/align.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ import (
"github.com/pkg/profile"
"github.com/spf13/cobra"
"github.com/will-rowe/groot/src/graph"
"github.com/will-rowe/groot/src/lshIndex"
"github.com/will-rowe/groot/src/lshForest"
"github.com/will-rowe/groot/src/misc"
"github.com/will-rowe/groot/src/stream"
"github.com/will-rowe/groot/src/version"
Expand Down Expand Up @@ -160,8 +160,8 @@ func alignParamCheck() error {
func runAlign() {
// set up profiling
if *profiling == true {
defer profile.Start(profile.MemProfile, profile.ProfilePath("./")).Stop()
//defer profile.Start(profile.ProfilePath("./")).Stop()
//defer profile.Start(profile.MemProfile, profile.ProfilePath("./")).Stop()
defer profile.Start(profile.ProfilePath("./")).Stop()
}
// start logging
logFH := misc.StartLogging(*logFile)
Expand All @@ -187,29 +187,21 @@ func runAlign() {
log.Print("loading index information...")
info := new(misc.IndexInfo)
misc.ErrorCheck(info.Load(*indexDir + "/index.info"))
if info.Containment {
log.Printf("\tindex type: lshEnsemble")
log.Printf("\tcontainment search seeding: enabled")
} else {
log.Printf("\tindex type: lshForest")
log.Printf("\tcontainment search seeding: disabled")
}
log.Printf("\twindow sized used in indexing: %d\n", info.ReadLength)
log.Printf("\tk-mer size: %d\n", info.Ksize)
log.Printf("\tsignature size: %d\n", info.SigSize)
log.Printf("\tJaccard similarity theshold: %0.2f\n", info.JSthresh)
log.Printf("\twindow sized used in indexing: %d\n", info.ReadLength)
log.Print("loading the groot graphs...")
graphStore := make(graph.GraphStore)
misc.ErrorCheck(graphStore.Load(*indexDir + "/index.graph"))
log.Printf("\tnumber of variation graphs: %d\n", len(graphStore))
log.Print("loading the MinHash signatures...")
var database *lshIndex.LshEnsemble
if info.Containment {
database = lshIndex.NewLSHensemble(make([]lshIndex.Partition, lshIndex.PARTITIONS), info.SigSize, lshIndex.MAXK)
} else {
database = lshIndex.NewLSHforest(info.SigSize, info.JSthresh)
}
database := lshForest.NewLSHforest(info.SigSize, info.JSthresh)
misc.ErrorCheck(database.Load(*indexDir + "/index.sigs"))
database.Index()
numHF, numBucks := database.Settings()
log.Printf("\tnumber of hash functions per bucket: %d\n", numHF)
log.Printf("\tnumber of buckets: %d\n", numBucks)
///////////////////////////////////////////////////////////////////////////////////////
// create SAM references from the sequences held in the graphs
referenceMap, err := graphStore.GetRefs()
Expand All @@ -230,12 +222,10 @@ func runAlign() {

// add in the process parameters
dataStream.InputFile = *fastq
fastqChecker.Containment = info.Containment
fastqChecker.WindowSize = info.ReadLength
dbQuerier.Db = database
dbQuerier.CommandInfo = info
dbQuerier.GraphStore = graphStore
dbQuerier.Threshold = info.JSthresh
graphAligner.GraphStore = graphStore
graphAligner.RefMap = referenceMap
graphAligner.MaxClip = *clip
Expand Down
96 changes: 42 additions & 54 deletions cmd/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ import (
"github.com/spf13/cobra"
"github.com/will-rowe/gfa"
"github.com/will-rowe/groot/src/graph"
"github.com/will-rowe/groot/src/lshIndex"
"github.com/will-rowe/groot/src/lshForest"
"github.com/will-rowe/groot/src/misc"
"github.com/will-rowe/groot/src/seqio"
"github.com/will-rowe/groot/src/version"
Expand All @@ -51,9 +51,6 @@ var (
msaList []string // the collected MSA files
outDir *string // directory to save index files and log to
defaultOutDir = "./groot-index-" + string(time.Now().Format("20060102150405")) // a default dir to store the index files
containment *bool // use lshEnsemble instead of lshForest -- allows for variable read length
maxK *int // the maxK for LSH Ensemble (only active for --containment)
numPart *int // the number of partitions for LSH Ensemble (only active for --containment)
)

// the index command (used by cobra)
Expand All @@ -71,15 +68,12 @@ var indexCmd = &cobra.Command{

// a function to initialise the command line arguments
func init() {
kSize = indexCmd.Flags().IntP("kmerSize", "k", 31, "size of k-mer")
sigSize = indexCmd.Flags().IntP("sigSize", "s", 84, "size of MinHash signature")
readLength = indexCmd.Flags().IntP("readLength", "l", 100, "max length of query reads (which will be aligned during the align subcommand)")
jsThresh = indexCmd.Flags().Float64P("jsThresh", "j", 0.99, "minimum Jaccard similarity for a seed to be recorded (note: this is used as a containment theshold when --containment set")
kSize = indexCmd.Flags().IntP("kmerSize", "k", 7, "size of k-mer")
sigSize = indexCmd.Flags().IntP("sigSize", "s", 128, "size of MinHash signature")
readLength = indexCmd.Flags().IntP("readLength", "l", 100, "length of query reads (which will be aligned during the align subcommand)")
jsThresh = indexCmd.Flags().Float64P("jsThresh", "j", 0.99, "minimum Jaccard similarity for a seed to be recorded")
msaDir = indexCmd.Flags().StringP("msaDir", "i", "", "directory containing the clustered references (MSA files) - required")
outDir = indexCmd.PersistentFlags().StringP("outDir", "o", defaultOutDir, "directory to save index files to")
containment = indexCmd.Flags().BoolP("containment", "c", false, "use lshEnsemble instead of lshForest (allows read length variation > 10 bases)")
maxK = indexCmd.Flags().IntP("maxK", "m", 4, "maxK in LSH Ensemble (only active with --containment)")
numPart = indexCmd.Flags().IntP("numPart", "n", 4, "num. partitions in LSH Ensemble (only active with --containment)")
indexCmd.MarkFlagRequired("msaDir")
RootCmd.AddCommand(indexCmd)
}
Expand Down Expand Up @@ -142,11 +136,6 @@ func runIndex() {
// check the supplied files and then log some stuff
log.Printf("checking parameters...")
misc.ErrorCheck(indexParamCheck())
if *containment {
log.Printf("\tindexing scheme: lshEnsemble (containment search)")
} else {
log.Printf("\tindexing scheme: lshForest")
}
log.Printf("\tprocessors: %d", *proc)
log.Printf("\tk-mer size: %d", *kSize)
log.Printf("\tsignature size: %d", *sigSize)
Expand Down Expand Up @@ -209,57 +198,56 @@ func runIndex() {
}()
///////////////////////////////////////////////////////////////////////////////////////
// collect and store the GrootGraph windows
sigStore := []*lshIndex.GraphWindow{}
lookupMap := make(lshIndex.KeyLookupMap)
var sigStore = make([]map[int]map[int][][]uint64, len(graphStore))
for i := range sigStore {
sigStore[i] = make(map[int]map[int][][]uint64)
}
// receive the signatures
var sigCount int = 0
for window := range windowChan {
// combine graphID, nodeID and offset to form a string key for signature
stringKey := fmt.Sprintf("g%dn%do%d", window.GraphID, window.Node, window.OffSet)
// convert to a graph window
gw := &lshIndex.GraphWindow{stringKey, *readLength, window.Sig}
// initialise the inner map of sigStore if graph has not been seen yet
if _, ok := sigStore[window.GraphID][window.Node]; !ok {
sigStore[window.GraphID][window.Node] = make(map[int][][]uint64)
}
// store the signature for the graph:node:offset
sigStore = append(sigStore, gw)
// add a key to the lookup map
lookupMap[stringKey] = seqio.Key{GraphID: window.GraphID, Node: window.Node, OffSet: window.OffSet}
sigStore[window.GraphID][window.Node][window.OffSet] = append(sigStore[window.GraphID][window.Node][window.OffSet], window.Sig)
sigCount++
}
numSigs := len(sigStore)
log.Printf("\tnumber of signatures generated: %d\n", numSigs)
var database *lshIndex.LshEnsemble
if *containment == false {
///////////////////////////////////////////////////////////////////////////////////////
// run LSH forest
log.Printf("running LSH Forest...\n")
database = lshIndex.NewLSHforest(*sigSize, *jsThresh)
// range over the nodes in each graph, each node will have one or more signature
for window := range lshIndex.Windows2Chan(sigStore) {
// add the signature to the lshForest
database.Add(window.Key, window.Signature, 0)
log.Printf("\tnumber of signatures generated: %d\n", sigCount)
///////////////////////////////////////////////////////////////////////////////////////
// run LSH forest
log.Printf("running LSH forest...\n")
database := lshForest.NewLSHforest(*sigSize, *jsThresh)
// range over the nodes in each graph, each node will have one or more signature
for graphID, nodesMap := range sigStore {
// add each signature to the database
for nodeID, offsetMap := range nodesMap {
for offset, signatures := range offsetMap {
for _, signature := range signatures {
// combine graphID, nodeID and offset to form a string key for signature
stringKey := fmt.Sprintf("g%dn%do%d", graphID, nodeID, offset)
// add the key to a lookup map
key := seqio.Key{GraphID: graphID, Node: nodeID, OffSet: offset}
database.KeyLookup[stringKey] = key
// add the signature to the lshForest
database.Add(stringKey, signature)
}
}
}
// print some stuff
numHF, numBucks := database.Lshes[0].Settings()
log.Printf("\tnumber of hash functions per bucket: %d\n", numHF)
log.Printf("\tnumber of buckets: %d\n", numBucks)
} else {
///////////////////////////////////////////////////////////////////////////////////////
// run LSH ensemble (https://github.com/ekzhu/lshensemble)
log.Printf("running LSH Ensemble...\n")
database = lshIndex.BootstrapLshEnsemble(*numPart, *sigSize, *maxK, numSigs, lshIndex.Windows2Chan(sigStore))
// print some stuff
log.Printf("\tnumber of LSH Ensemble partitions: %d\n", *numPart)
log.Printf("\tmax no. hash functions per bucket: %d\n", *maxK)
}
// attach the key lookup map to the index
database.KeyLookup = lookupMap
numHF, numBucks := database.Settings()
log.Printf("\tnumber of hash functions per bucket: %d\n", numHF)
log.Printf("\tnumber of buckets: %d\n", numBucks)
///////////////////////////////////////////////////////////////////////////////////////
// record runtime info
info := &misc.IndexInfo{Version: version.VERSION, Ksize: *kSize, SigSize: *sigSize, JSthresh: *jsThresh, ReadLength: *readLength, Containment: *containment}
info := &misc.IndexInfo{Version: version.VERSION, Ksize: *kSize, SigSize: *sigSize, JSthresh: *jsThresh, ReadLength: *readLength}
// save the index files
log.Printf("saving index files to \"%v\"...", *outDir)
misc.ErrorCheck(database.Dump(*outDir + "/index.sigs"))
log.Printf("\tsaved MinHash signatures")
misc.ErrorCheck(info.Dump(*outDir + "/index.info"))
log.Printf("\tsaved runtime info")
misc.ErrorCheck(graphStore.Dump(*outDir + "/index.graph"))
log.Printf("\tsaved groot graphs")
misc.ErrorCheck(database.Dump(*outDir + "/index.sigs"))
log.Printf("\tsaved MinHash signatures")
log.Println("finished")
}
2 changes: 1 addition & 1 deletion cmd/report.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ func reportParamCheck() error {
log.Printf("\tBAM file: %v", *bamFile)
}
if *covCutoff > 1.0 {
return fmt.Errorf("supplied coverage cutoff exceeds 1.0 (100%%): %.2f", *covCutoff)
return fmt.Errorf("supplied coverage cutoff exceeds 1.0 (100%%): %v", *covCutoff)
}
return nil
}
Expand Down
9 changes: 5 additions & 4 deletions paper/benchmarking/accuracy/accuracy-test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@ package main
import (
"flag"
"fmt"
"github.com/biogo/hts/bam"
"github.com/biogo/hts/bgzf"
"github.com/biogo/hts/sam"
"io"
"log"
"os"
"strings"

"github.com/biogo/hts/bam"
"github.com/biogo/hts/bgzf"
"github.com/biogo/hts/sam"
)

var inputFile = flag.String("bam", "", "bam file to run accuracy test on")
Expand All @@ -28,7 +29,7 @@ func main() {
log.Fatalf("could not open bam file %q:", err)
}
if !ok {
log.Printf("file %q has no bgzf magic block: may be truncated", inputFile)
log.Printf("file %v has no bgzf magic block: may be truncated", inputFile)
}
r = f
b, err := bam.NewReader(r, 0)
Expand Down
30 changes: 12 additions & 18 deletions src/alignment/alignment_test.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
package alignment

import (
"fmt"
"io"
"log"
"os"
"testing"

Expand All @@ -18,12 +18,12 @@ var (
sigSize = 128
)

func loadGFA() (*gfa.GFA, error) {
func loadGFA() *gfa.GFA {
// load the GFA file
fh, err := os.Open(inputFile)
reader, err := gfa.NewReader(fh)
if err != nil {
return nil, fmt.Errorf("can't read gfa file: %v", err)
log.Fatalf("can't read gfa file: %v", err)
}
// collect the GFA instance
myGFA := reader.CollectGFA()
Expand All @@ -34,13 +34,13 @@ func loadGFA() (*gfa.GFA, error) {
break
}
if err != nil {
return nil, fmt.Errorf("error reading line in gfa file: %v", err)
log.Fatalf("error reading line in gfa file: %v", err)
}
if err := line.Add(myGFA); err != nil {
return nil, fmt.Errorf("error adding line to GFA instance: %v", err)
log.Fatalf("error adding line to GFA instance: %v", err)
}
}
return myGFA, nil
return myGFA
}

func setupMultimapRead() (*seqio.FASTQread, error) {
Expand Down Expand Up @@ -80,16 +80,13 @@ func TestExactMatchMultiMapper(t *testing.T) {
// create the read
testRead, err := setupMultimapRead()
if err != nil {
t.Fatal(err)
log.Fatal(err)
}
// create the GrootGraph and graphStore
myGFA, err := loadGFA()
if err != nil {
t.Fatal(err)
}
myGFA := loadGFA()
grootGraph, err := graph.CreateGrootGraph(myGFA, 1)
if err != nil {
t.Fatal(err)
log.Fatal(err)
}
graphStore := make(graph.GraphStore)
graphStore[grootGraph.GraphID] = grootGraph
Expand All @@ -116,16 +113,13 @@ func TestExactMatchUniqMapper(t *testing.T) {
// create the read
testRead, err := setupUniqmapRead()
if err != nil {
t.Fatal(err)
log.Fatal(err)
}
// create the GrootGraph and graphStore
myGFA, err := loadGFA()
if err != nil {
t.Fatal(err)
}
myGFA := loadGFA()
grootGraph, err := graph.CreateGrootGraph(myGFA, 1)
if err != nil {
t.Fatal(err)
log.Fatal(err)
}
graphStore := make(graph.GraphStore)
graphStore[grootGraph.GraphID] = grootGraph
Expand Down
25 changes: 11 additions & 14 deletions src/graph/graph.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ package graph
import (
"bytes"
"encoding/binary"
"encoding/gob"
"fmt"
"io/ioutil"
"os"
"sort"
"strconv"
Expand All @@ -14,8 +14,9 @@ import (

"github.com/biogo/hts/sam"
"github.com/will-rowe/gfa"
"github.com/will-rowe/groot/src/seqio"
"github.com/will-rowe/groot/src/misc"
"github.com/will-rowe/groot/src/seqio"
"gopkg.in/vmihailenco/msgpack.v2"
)

/*
Expand Down Expand Up @@ -430,24 +431,20 @@ type GraphStore map[int]*GrootGraph

// Dump is a method to save a GrootGraph to file
func (graphStore *GraphStore) Dump(path string) error {
file, err := os.Create(path)
if err == nil {
encoder := gob.NewEncoder(file)
encoder.Encode(graphStore)
b, err := msgpack.Marshal(graphStore)
if err != nil {
return err
}
file.Close()
return err
return ioutil.WriteFile(path, b, 0644)
}

// Load is a method to load a GrootGraph from file
func (graphStore *GraphStore) Load(path string) error {
file, err := os.Open(path)
if err == nil {
decoder := gob.NewDecoder(file)
err = decoder.Decode(graphStore)
b, err := ioutil.ReadFile(path)
if err != nil {
return err
}
file.Close()
return err
return msgpack.Unmarshal(b, graphStore)
}

// GetRefs is a method to convert all paths held in graphStore to sam.References
Expand Down
Loading

0 comments on commit 92ead3b

Please sign in to comment.