From 56a426952c5ce4e20b4673a0087e6a2c5604fdf5 Mon Sep 17 00:00:00 2001 From: Vincent Ambo Date: Mon, 12 Aug 2019 01:41:17 +0100 Subject: [PATCH] feat(group-layers): Finish layering algorithm implementation This commit adds the actual logic for extracting layer groupings and merging them until the layer budget is satisfied. The implementation conforms to the design doc as of the time of this commit. --- tools/nixery/group-layers/group-layers.go | 161 ++++++++++++++-------- 1 file changed, 103 insertions(+), 58 deletions(-) diff --git a/tools/nixery/group-layers/group-layers.go b/tools/nixery/group-layers/group-layers.go index 7be89e4e6..93f2e520a 100644 --- a/tools/nixery/group-layers/group-layers.go +++ b/tools/nixery/group-layers/group-layers.go @@ -65,12 +65,11 @@ // // If the list of layers fits within the layer budget, it is returned. // -// Otherwise layers are merged together in this order: +// Otherwise, a merge rating is calculated for each layer. This is the +// product of the layer's total size and its root node's popularity. // -// * layers whose root meets neither condition above -// * layers whose root is popular -// * layers whose root is big -// * layers whose root meets both conditions +// Layers are then merged in ascending order of merge ratings until +// they fit into the layer budget. // // # Threshold values // @@ -109,10 +108,10 @@ import ( "io/ioutil" "log" "regexp" + "sort" - "gonum.org/v1/gonum/graph/simple" "gonum.org/v1/gonum/graph/flow" - "gonum.org/v1/gonum/graph/encoding/dot" + "gonum.org/v1/gonum/graph/simple" ) // closureGraph represents the structured attributes Nix outputs when asking it @@ -123,7 +122,7 @@ type exportReferences struct { } `json:"exportReferencesGraph"` Graph []struct { - Size uint64 `json:"closureSize` + Size uint64 `json:"closureSize"` Path string `json:"path"` Refs []string `json:"references"` } `json:"graph"` @@ -136,14 +135,26 @@ type exportReferences struct { // of the nixpkgs tree. type pkgsMetadata = map[string]int +// layer represents the data returned for each layer that Nix should +// build for the container image. +type layer struct { + Contents []string `json:"contents"` + mergeRating uint64 +} + +func (a layer) merge(b layer) layer { + a.Contents = append(a.Contents, b.Contents...) + a.mergeRating += b.mergeRating + return a +} + // closure as pointed to by the graph nodes. type closure struct { - GraphID int64 - Path string - Size uint64 - Refs []string + GraphID int64 + Path string + Size uint64 + Refs []string Popularity int - // TODO(tazjin): popularity and other funny business } func (c *closure) ID() int64 { @@ -151,6 +162,7 @@ func (c *closure) ID() int64 { } var nixRegexp = regexp.MustCompile(`^/nix/store/[a-z0-9]+-`) + func (c *closure) DOTID() string { return nixRegexp.ReplaceAllString(c.Path, "") } @@ -158,29 +170,30 @@ func (c *closure) DOTID() string { // bigOrPopular checks whether this closure should be considered for // separation into its own layer, even if it would otherwise only // appear in a subtree of the dominator tree. -func (c *closure) bigOrPopular(pkgs *pkgsMetadata) bool { +func (c *closure) bigOrPopular() bool { const sizeThreshold = 100 * 1000000 // 100MB if c.Size > sizeThreshold { return true } - // TODO(tazjin): After generating the full data, this should - // be changed to something other than a simple inclusion - // (currently the test-data only contains the top 200 - // packages). - pop, ok := (*pkgs)[c.DOTID()] - if ok { - log.Printf("%q is popular!\n", c.DOTID()) + // The threshold value used here is currently roughly the + // minimum number of references that only 1% of packages in + // the entire package set have. + // + // TODO(tazjin): Do this more elegantly by calculating + // percentiles for each package and using those instead. + if c.Popularity >= 1000 { + return true } - c.Popularity = pop - return ok + + return false } -func insertEdges(graph *simple.DirectedGraph, pop *pkgsMetadata, cmap *map[string]*closure, node *closure) { +func insertEdges(graph *simple.DirectedGraph, cmap *map[string]*closure, node *closure) { // Big or popular nodes get a separate edge from the top to // flag them for their own layer. - if node.bigOrPopular(pop) && !graph.HasEdgeFromTo(0, node.ID()) { + if node.bigOrPopular() && !graph.HasEdgeFromTo(0, node.ID()) { edge := graph.NewEdge(graph.Node(0), node) graph.SetEdge(edge) } @@ -205,18 +218,24 @@ func buildGraph(refs *exportReferences, pop *pkgsMetadata) *simple.DirectedGraph // // A map from store paths to IDs is kept to actually insert // edges below. - root := &closure { + root := &closure{ GraphID: 0, - Path: "image_root", + Path: "image_root", } graph.AddNode(root) for idx, c := range refs.Graph { - node := &closure { + node := &closure{ GraphID: int64(idx + 1), // inc because of root node - Path: c.Path, - Size: c.Size, - Refs: c.Refs, + Path: c.Path, + Size: c.Size, + Refs: c.Refs, + } + + if p, ok := (*pop)[node.DOTID()]; ok { + node.Popularity = p + } else { + node.Popularity = 1 } graph.AddNode(node) @@ -231,49 +250,74 @@ func buildGraph(refs *exportReferences, pop *pkgsMetadata) *simple.DirectedGraph } for _, c := range cmap { - insertEdges(graph, pop, &cmap, c) + insertEdges(graph, &cmap, c) } - // gv, err := dot.Marshal(graph, "deps", "", "") - // if err != nil { - // log.Fatalf("Could not encode graph: %s\n", err) - // } - // fmt.Print(string(gv)) - // os.Exit(0) - return graph } +// Extracts a subgraph starting at the specified root from the +// dominator tree. The subgraph is converted into a flat list of +// layers, each containing the store paths and merge rating. +func groupLayer(dt *flow.DominatorTree, root *closure) layer { + size := root.Size + contents := []string{root.Path} + children := dt.DominatedBy(root.ID()) + + // This iteration does not use 'range' because the list being + // iterated is modified during the iteration (yes, I'm sorry). + for i := 0; i < len(children); i++ { + child := children[i].(*closure) + size += child.Size + contents = append(contents, child.Path) + children = append(children, dt.DominatedBy(child.ID())...) + } + + return layer{ + Contents: contents, + // TODO(tazjin): The point of this is to factor in + // both the size and the popularity when making merge + // decisions, but there might be a smarter way to do + // it than a plain multiplication. + mergeRating: uint64(root.Popularity) * size, + } +} + // Calculate the dominator tree of the entire package set and group // each top-level subtree into a layer. -func dominate(graph *simple.DirectedGraph) { +// +// Layers are merged together until they fit into the layer budget, +// based on their merge rating. +func dominate(budget int, graph *simple.DirectedGraph) []layer { dt := flow.Dominators(graph.Node(0), graph) - // convert dominator tree back into encodable graph - dg := simple.NewDirectedGraph() - - for nodes := graph.Nodes(); nodes.Next(); { - dg.AddNode(nodes.Node()) + var layers []layer + for _, n := range dt.DominatedBy(dt.Root().ID()) { + layers = append(layers, groupLayer(&dt, n.(*closure))) } - for nodes := dg.Nodes(); nodes.Next(); { - node := nodes.Node() - for _, child := range dt.DominatedBy(node.ID()) { - edge := dg.NewEdge(node, child) - dg.SetEdge(edge) - } + sort.Slice(layers, func(i, j int) bool { + return layers[i].mergeRating < layers[j].mergeRating + }) + + if len(layers) > budget { + log.Printf("Ideal image has %v layers, but budget is %v\n", len(layers), budget) } - gv, err := dot.Marshal(dg, "deps", "", "") - if err != nil { - log.Fatalf("Could not encode graph: %s\n", err) + for len(layers) > budget { + merged := layers[0].merge(layers[1]) + layers[1] = merged + layers = layers[1:] } - ioutil.WriteFile("graph.dot", gv, 0644) + + return layers } func main() { graphFile := flag.String("graph", ".attrs.json", "Input file containing graph") popFile := flag.String("pop", "popularity.json", "Package popularity data") + outFile := flag.String("out", "layers.json", "File to write layers to") + layerBudget := flag.Int("budget", 94, "Total layer budget available") flag.Parse() // Parse graph data @@ -300,8 +344,9 @@ func main() { log.Fatalf("Failed to deserialise input: %s\n", err) } - log.Printf("%v\n", pop) - graph := buildGraph(&refs, &pop) - dominate(graph) + layers := dominate(*layerBudget, graph) + + j, _ := json.Marshal(layers) + ioutil.WriteFile(*outFile, j, 0644) }