feat(group-layers): Finish layering algorithm implementation

This commit adds the actual logic for extracting layer groupings and merging them until the layer budget is satisfied. The implementation conforms to the design doc as of the time of this commit.
2019-08-12 01:41:17 +01:00 · 2019-08-12 01:41:17 +01:00 · 56a426952c
commit 56a426952c
parent 590ce994bb
1 changed file with 103 additions and 58 deletions
--- a/tools/nixery/group-layers/group-layers.go
+++ b/tools/nixery/group-layers/group-layers.go
@ -65,12 +65,11 @@
 //
 // If the list of layers fits within the layer budget, it is returned.
 //
-// Otherwise layers are merged together in this order:
+// Otherwise, a merge rating is calculated for each layer. This is the
+// product of the layer's total size and its root node's popularity.
 //
-// * layers whose root meets neither condition above
-// * layers whose root is popular
-// * layers whose root is big
-// * layers whose root meets both conditions
+// Layers are then merged in ascending order of merge ratings until
+// they fit into the layer budget.
 //
 // # Threshold values
 //
@ -109,10 +108,10 @@ import (
 	"io/ioutil"
 	"log"
 	"regexp"
+	"sort"

-	"gonum.org/v1/gonum/graph/simple"
 	"gonum.org/v1/gonum/graph/flow"
-	"gonum.org/v1/gonum/graph/encoding/dot"
+	"gonum.org/v1/gonum/graph/simple"
 )

 // closureGraph represents the structured attributes Nix outputs when asking it
@ -123,7 +122,7 @@ type exportReferences struct {
 	} `json:"exportReferencesGraph"`

 	Graph []struct {
-		Size uint64 `json:"closureSize`
+		Size uint64   `json:"closureSize"`
 		Path string   `json:"path"`
 		Refs []string `json:"references"`
 	} `json:"graph"`
@ -136,6 +135,19 @@ type exportReferences struct {
 // of the nixpkgs tree.
 type pkgsMetadata = map[string]int

+// layer represents the data returned for each layer that Nix should
+// build for the container image.
+type layer struct {
+	Contents    []string `json:"contents"`
+	mergeRating uint64
+}
+
+func (a layer) merge(b layer) layer {
+	a.Contents = append(a.Contents, b.Contents...)
+	a.mergeRating += b.mergeRating
+	return a
+}
+
 // closure as pointed to by the graph nodes.
 type closure struct {
 	GraphID    int64
@ -143,7 +155,6 @@ type closure struct {
 	Size       uint64
 	Refs       []string
 	Popularity int
-	// TODO(tazjin): popularity and other funny business
 }

 func (c *closure) ID() int64 {
@ -151,6 +162,7 @@ func (c *closure) ID() int64 {
 }

 var nixRegexp = regexp.MustCompile(`^/nix/store/[a-z0-9]+-`)
+
 func (c *closure) DOTID() string {
 	return nixRegexp.ReplaceAllString(c.Path, "")
 }
@ -158,29 +170,30 @@ func (c *closure) DOTID() string {
 // bigOrPopular checks whether this closure should be considered for
 // separation into its own layer, even if it would otherwise only
 // appear in a subtree of the dominator tree.
-func (c *closure) bigOrPopular(pkgs *pkgsMetadata) bool {
+func (c *closure) bigOrPopular() bool {
 	const sizeThreshold = 100 * 1000000 // 100MB

 	if c.Size > sizeThreshold {
 		return true
 	}

-	// TODO(tazjin): After generating the full data, this should
-	// be changed to something other than a simple inclusion
-	// (currently the test-data only contains the top 200
-	// packages).
-	pop, ok := (*pkgs)[c.DOTID()]
-	if ok {
-		log.Printf("%q is popular!\n", c.DOTID())
+	// The threshold used here is currently roughly the number
+	// of references that only the most-referenced 1% of packages
+	// in the entire package set reach.
+	//
+	// TODO(tazjin): Do this more elegantly by calculating
+	// percentiles for each package and using those instead.
+	if c.Popularity >= 1000 {
+		return true
 	}
-	c.Popularity = pop
-	return ok
+
+	return false
 }

-func insertEdges(graph *simple.DirectedGraph, pop *pkgsMetadata, cmap *map[string]*closure, node *closure) {
+func insertEdges(graph *simple.DirectedGraph, cmap *map[string]*closure, node *closure) {
 	// Big or popular nodes get a separate edge from the top to
 	// flag them for their own layer.
-	if node.bigOrPopular(pop) && !graph.HasEdgeFromTo(0, node.ID()) {
+	if node.bigOrPopular() && !graph.HasEdgeFromTo(0, node.ID()) {
 		edge := graph.NewEdge(graph.Node(0), node)
 		graph.SetEdge(edge)
 	}
@ -205,20 +218,26 @@ func buildGraph(refs *exportReferences, pop *pkgsMetadata) *simple.DirectedGraph
 	//
 	// A map from store paths to IDs is kept to actually insert
 	// edges below.
-	root := &closure {
+	root := &closure{
 		GraphID: 0,
 		Path:    "image_root",
 	}
 	graph.AddNode(root)

 	for idx, c := range refs.Graph {
-		node := &closure {
+		node := &closure{
 			GraphID: int64(idx + 1), // inc because of root node
 			Path:    c.Path,
 			Size:    c.Size,
 			Refs:    c.Refs,
 		}

+		if p, ok := (*pop)[node.DOTID()]; ok {
+			node.Popularity = p
+		} else {
+			node.Popularity = 1
+		}
+
 		graph.AddNode(node)
 		cmap[c.Path] = node
 	}
@ -231,49 +250,74 @@ func buildGraph(refs *exportReferences, pop *pkgsMetadata) *simple.DirectedGraph
 	}

 	for _, c := range cmap {
-		insertEdges(graph, pop, &cmap, c)
+		insertEdges(graph, &cmap, c)
 	}

-	// gv, err := dot.Marshal(graph, "deps", "", "")
-	// if err != nil {
-	// 	log.Fatalf("Could not encode graph: %s\n", err)
-	// }
-	// fmt.Print(string(gv))
-	// os.Exit(0)
-
 	return graph
 }

+// groupLayer extracts the subgraph rooted at the specified node from
+// the dominator tree and flattens it into a single layer holding the
+// store paths of the root and all nodes below it, plus a merge rating.
+func groupLayer(dt *flow.DominatorTree, root *closure) layer {
+	size := root.Size
+	contents := []string{root.Path}
+	children := dt.DominatedBy(root.ID())
+
+	// This iteration does not use 'range' because the list being
+	// iterated is modified during the iteration (yes, I'm sorry).
+	for i := 0; i < len(children); i++ {
+		child := children[i].(*closure)
+		size += child.Size
+		contents = append(contents, child.Path)
+		children = append(children, dt.DominatedBy(child.ID())...)
+	}
+
+	return layer{
+		Contents: contents,
+		// TODO(tazjin): The point of this is to factor in
+		// both the size and the popularity when making merge
+		// decisions, but there might be a smarter way to do
+		// it than a plain multiplication.
+		mergeRating: uint64(root.Popularity) * size,
+	}
+}
+
 // Calculate the dominator tree of the entire package set and group
 // each top-level subtree into a layer.
-func dominate(graph *simple.DirectedGraph) {
+//
+// Layers are merged together until they fit into the layer budget,
+// based on their merge rating.
+func dominate(budget int, graph *simple.DirectedGraph) []layer {
 	dt := flow.Dominators(graph.Node(0), graph)

-	// convert dominator tree back into encodable graph
-	dg := simple.NewDirectedGraph()
-
-	for nodes := graph.Nodes(); nodes.Next(); {
-		dg.AddNode(nodes.Node())
+	var layers []layer
+	for _, n := range dt.DominatedBy(dt.Root().ID()) {
+		layers = append(layers, groupLayer(&dt, n.(*closure)))
 	}

-	for nodes := dg.Nodes(); nodes.Next(); {
-		node := nodes.Node()
-		for _, child := range dt.DominatedBy(node.ID()) {
-			edge := dg.NewEdge(node, child)
-			dg.SetEdge(edge)
-		}
+	sort.Slice(layers, func(i, j int) bool {
+		return layers[i].mergeRating < layers[j].mergeRating
+	})
+
+	if len(layers) > budget {
+		log.Printf("Ideal image has %v layers, but budget is %v\n", len(layers), budget)
 	}

-	gv, err := dot.Marshal(dg, "deps", "", "")
-	if err != nil {
-		log.Fatalf("Could not encode graph: %s\n", err)
+	for len(layers) > budget {
+		merged := layers[0].merge(layers[1])
+		layers[1] = merged
+		layers = layers[1:]
 	}
-	ioutil.WriteFile("graph.dot", gv, 0644)
+
+	return layers
 }

 func main() {
 	graphFile := flag.String("graph", ".attrs.json", "Input file containing graph")
 	popFile := flag.String("pop", "popularity.json", "Package popularity data")
+	outFile := flag.String("out", "layers.json", "File to write layers to")
+	layerBudget := flag.Int("budget", 94, "Total layer budget available")
 	flag.Parse()

 	// Parse graph data
@ -300,8 +344,9 @@ func main() {
 		log.Fatalf("Failed to deserialise input: %s\n", err)
 	}

-	log.Printf("%v\n", pop)
-
 	graph := buildGraph(&refs, &pop)
-	dominate(graph)
+	layers := dominate(*layerBudget, graph)
+
+	j, _ := json.Marshal(layers)
+	ioutil.WriteFile(*outFile, j, 0644)
 }