feat(group-layers): Finish layering algorithm implementation

This commit adds the actual logic for extracting layer groupings and
merging them until the layer budget is satisfied.

The implementation conforms to the design doc as of the time of this
commit.
This commit is contained in:
Vincent Ambo 2019-08-12 01:41:17 +01:00 committed by Vincent Ambo
parent 590ce994bb
commit 56a426952c

View file

@ -65,12 +65,11 @@
// //
// If the list of layers fits within the layer budget, it is returned. // If the list of layers fits within the layer budget, it is returned.
// //
// Otherwise layers are merged together in this order: // Otherwise, a merge rating is calculated for each layer. This is the
// product of the layer's total size and its root node's popularity.
// //
// * layers whose root meets neither condition above // Layers are then merged in ascending order of merge ratings until
// * layers whose root is popular // they fit into the layer budget.
// * layers whose root is big
// * layers whose root meets both conditions
// //
// # Threshold values // # Threshold values
// //
@ -109,10 +108,10 @@ import (
"io/ioutil" "io/ioutil"
"log" "log"
"regexp" "regexp"
"sort"
"gonum.org/v1/gonum/graph/simple"
"gonum.org/v1/gonum/graph/flow" "gonum.org/v1/gonum/graph/flow"
"gonum.org/v1/gonum/graph/encoding/dot" "gonum.org/v1/gonum/graph/simple"
) )
// closureGraph represents the structured attributes Nix outputs when asking it // closureGraph represents the structured attributes Nix outputs when asking it
@ -123,7 +122,7 @@ type exportReferences struct {
} `json:"exportReferencesGraph"` } `json:"exportReferencesGraph"`
Graph []struct { Graph []struct {
Size uint64 `json:"closureSize` Size uint64 `json:"closureSize"`
Path string `json:"path"` Path string `json:"path"`
Refs []string `json:"references"` Refs []string `json:"references"`
} `json:"graph"` } `json:"graph"`
@ -136,14 +135,26 @@ type exportReferences struct {
// of the nixpkgs tree. // of the nixpkgs tree.
type pkgsMetadata = map[string]int type pkgsMetadata = map[string]int
// layer represents the data returned for each layer that Nix should
// build for the container image.
type layer struct {
	// Contents holds the store paths that make up this layer. It is
	// the only field that is serialised to JSON.
	Contents []string `json:"contents"`

	// mergeRating is used to order layers when deciding which ones
	// to merge. It is unexported and therefore never serialised.
	mergeRating uint64
}

// merge returns a layer containing the contents of a followed by the
// contents of b, with the merge ratings of both layers added up.
//
// Neither a nor b is modified by the merge.
func (a layer) merge(b layer) layer {
	// The receiver is a copy, but its Contents slice still shares a
	// backing array with the caller's layer. Capping the capacity
	// with a full slice expression forces append to allocate a fresh
	// array instead of writing into spare capacity that another
	// layer (or an earlier merge result) may also be looking at.
	own := a.Contents[:len(a.Contents):len(a.Contents)]
	a.Contents = append(own, b.Contents...)
	a.mergeRating += b.mergeRating
	return a
}
// closure as pointed to by the graph nodes. // closure as pointed to by the graph nodes.
type closure struct { type closure struct {
GraphID int64 GraphID int64
Path string Path string
Size uint64 Size uint64
Refs []string Refs []string
Popularity int Popularity int
// TODO(tazjin): popularity and other funny business
} }
func (c *closure) ID() int64 { func (c *closure) ID() int64 {
@ -151,6 +162,7 @@ func (c *closure) ID() int64 {
} }
var nixRegexp = regexp.MustCompile(`^/nix/store/[a-z0-9]+-`) var nixRegexp = regexp.MustCompile(`^/nix/store/[a-z0-9]+-`)
func (c *closure) DOTID() string { func (c *closure) DOTID() string {
return nixRegexp.ReplaceAllString(c.Path, "") return nixRegexp.ReplaceAllString(c.Path, "")
} }
@ -158,29 +170,30 @@ func (c *closure) DOTID() string {
// bigOrPopular checks whether this closure should be considered for // bigOrPopular checks whether this closure should be considered for
// separation into its own layer, even if it would otherwise only // separation into its own layer, even if it would otherwise only
// appear in a subtree of the dominator tree. // appear in a subtree of the dominator tree.
func (c *closure) bigOrPopular(pkgs *pkgsMetadata) bool { func (c *closure) bigOrPopular() bool {
const sizeThreshold = 100 * 1000000 // 100MB const sizeThreshold = 100 * 1000000 // 100MB
if c.Size > sizeThreshold { if c.Size > sizeThreshold {
return true return true
} }
// TODO(tazjin): After generating the full data, this should // The threshold value used here is currently roughly the
// be changed to something other than a simple inclusion // minimum number of references that only 1% of packages in
// (currently the test-data only contains the top 200 // the entire package set have.
// packages). //
pop, ok := (*pkgs)[c.DOTID()] // TODO(tazjin): Do this more elegantly by calculating
if ok { // percentiles for each package and using those instead.
log.Printf("%q is popular!\n", c.DOTID()) if c.Popularity >= 1000 {
return true
} }
c.Popularity = pop
return ok return false
} }
func insertEdges(graph *simple.DirectedGraph, pop *pkgsMetadata, cmap *map[string]*closure, node *closure) { func insertEdges(graph *simple.DirectedGraph, cmap *map[string]*closure, node *closure) {
// Big or popular nodes get a separate edge from the top to // Big or popular nodes get a separate edge from the top to
// flag them for their own layer. // flag them for their own layer.
if node.bigOrPopular(pop) && !graph.HasEdgeFromTo(0, node.ID()) { if node.bigOrPopular() && !graph.HasEdgeFromTo(0, node.ID()) {
edge := graph.NewEdge(graph.Node(0), node) edge := graph.NewEdge(graph.Node(0), node)
graph.SetEdge(edge) graph.SetEdge(edge)
} }
@ -205,18 +218,24 @@ func buildGraph(refs *exportReferences, pop *pkgsMetadata) *simple.DirectedGraph
// //
// A map from store paths to IDs is kept to actually insert // A map from store paths to IDs is kept to actually insert
// edges below. // edges below.
root := &closure { root := &closure{
GraphID: 0, GraphID: 0,
Path: "image_root", Path: "image_root",
} }
graph.AddNode(root) graph.AddNode(root)
for idx, c := range refs.Graph { for idx, c := range refs.Graph {
node := &closure { node := &closure{
GraphID: int64(idx + 1), // inc because of root node GraphID: int64(idx + 1), // inc because of root node
Path: c.Path, Path: c.Path,
Size: c.Size, Size: c.Size,
Refs: c.Refs, Refs: c.Refs,
}
if p, ok := (*pop)[node.DOTID()]; ok {
node.Popularity = p
} else {
node.Popularity = 1
} }
graph.AddNode(node) graph.AddNode(node)
@ -231,49 +250,74 @@ func buildGraph(refs *exportReferences, pop *pkgsMetadata) *simple.DirectedGraph
} }
for _, c := range cmap { for _, c := range cmap {
insertEdges(graph, pop, &cmap, c) insertEdges(graph, &cmap, c)
} }
// gv, err := dot.Marshal(graph, "deps", "", "")
// if err != nil {
// log.Fatalf("Could not encode graph: %s\n", err)
// }
// fmt.Print(string(gv))
// os.Exit(0)
return graph return graph
} }
// groupLayer flattens the part of the dominator tree that hangs below
// root into a single layer.
//
// The resulting layer lists the store path of root followed by the
// store paths of every node that root (transitively) dominates. Its
// merge rating is the popularity of root multiplied by the combined
// size of all of those nodes.
func groupLayer(dt *flow.DominatorTree, root *closure) layer {
	total := root.Size
	paths := []string{root.Path}

	// Walk the subtree using a FIFO work queue: every node taken off
	// the front has its own dominated nodes pushed onto the back, so
	// nodes are visited level by level.
	queue := dt.DominatedBy(root.ID())
	for len(queue) > 0 {
		next := queue[0].(*closure)
		queue = queue[1:]

		total += next.Size
		paths = append(paths, next.Path)
		queue = append(queue, dt.DominatedBy(next.ID())...)
	}

	// TODO(tazjin): The point of this is to factor in both the size
	// and the popularity when making merge decisions, but there
	// might be a smarter way to do it than a plain multiplication.
	rating := uint64(root.Popularity) * total

	return layer{
		Contents:    paths,
		mergeRating: rating,
	}
}
// Calculate the dominator tree of the entire package set and group // Calculate the dominator tree of the entire package set and group
// each top-level subtree into a layer. // each top-level subtree into a layer.
func dominate(graph *simple.DirectedGraph) { //
// Layers are merged together until they fit into the layer budget,
// based on their merge rating.
func dominate(budget int, graph *simple.DirectedGraph) []layer {
dt := flow.Dominators(graph.Node(0), graph) dt := flow.Dominators(graph.Node(0), graph)
// convert dominator tree back into encodable graph var layers []layer
dg := simple.NewDirectedGraph() for _, n := range dt.DominatedBy(dt.Root().ID()) {
layers = append(layers, groupLayer(&dt, n.(*closure)))
for nodes := graph.Nodes(); nodes.Next(); {
dg.AddNode(nodes.Node())
} }
for nodes := dg.Nodes(); nodes.Next(); { sort.Slice(layers, func(i, j int) bool {
node := nodes.Node() return layers[i].mergeRating < layers[j].mergeRating
for _, child := range dt.DominatedBy(node.ID()) { })
edge := dg.NewEdge(node, child)
dg.SetEdge(edge) if len(layers) > budget {
} log.Printf("Ideal image has %v layers, but budget is %v\n", len(layers), budget)
} }
gv, err := dot.Marshal(dg, "deps", "", "") for len(layers) > budget {
if err != nil { merged := layers[0].merge(layers[1])
log.Fatalf("Could not encode graph: %s\n", err) layers[1] = merged
layers = layers[1:]
} }
ioutil.WriteFile("graph.dot", gv, 0644)
return layers
} }
func main() { func main() {
graphFile := flag.String("graph", ".attrs.json", "Input file containing graph") graphFile := flag.String("graph", ".attrs.json", "Input file containing graph")
popFile := flag.String("pop", "popularity.json", "Package popularity data") popFile := flag.String("pop", "popularity.json", "Package popularity data")
outFile := flag.String("out", "layers.json", "File to write layers to")
layerBudget := flag.Int("budget", 94, "Total layer budget available")
flag.Parse() flag.Parse()
// Parse graph data // Parse graph data
@ -300,8 +344,9 @@ func main() {
log.Fatalf("Failed to deserialise input: %s\n", err) log.Fatalf("Failed to deserialise input: %s\n", err)
} }
log.Printf("%v\n", pop)
graph := buildGraph(&refs, &pop) graph := buildGraph(&refs, &pop)
dominate(graph) layers := dominate(*layerBudget, graph)
j, _ := json.Marshal(layers)
ioutil.WriteFile(*outFile, j, 0644)
} }