Skip to content

Linear Regression 10M

A linear regression plot predicting flight arrival delay based on the time of departure, over 10 million flight records. Regression computation is performed in the database, with optimized selection updates using data cube indexes. The area around a regression line shows a 95% confidence interval. Select a region to view regression results for a data subset.

Loading Example...

Specification

js
import * as vg from "@uwdata/vgplot";

// SQL that builds the working table: ARR_DELAY clamped to [-60, 180] and cast
// to DOUBLE (as `delay`), plus DISTANCE (`distance`) and DEP_TIME (`time`).
const createFlightsTable = `CREATE TEMP TABLE IF NOT EXISTS flights10m AS SELECT GREATEST(-60, LEAST(ARR_DELAY, 180))::DOUBLE AS delay, DISTANCE AS distance, DEP_TIME AS time FROM 'https://idl.uw.edu/mosaic-datasets/data/flights-10m.parquet'`;

// The table must exist before any mark queries it, hence the await.
await vg.coordinator().exec([createFlightsTable]);

// Selection written by the interval brush and read by the filtered regression.
const $query = vg.Selection.intersect();

// Every data-backed layer plots departure time against arrival delay.
const channels = {x: "time", y: "delay"};

// Raster layer over all rows of the table.
const rasterLayer = vg.raster(
  vg.from("flights10m"),
  {...channels, pixelSize: 4, pad: 0, imageRendering: "pixelated"}
);

// Regression over the full table, drawn in gray as the baseline.
const overallFit = vg.regressionY(
  vg.from("flights10m"),
  {...channels, stroke: "gray"}
);

// Regression restricted to the rows selected via $query, drawn in firebrick.
const selectedFit = vg.regressionY(
  vg.from("flights10m", {filterBy: $query}),
  {...channels, stroke: "firebrick"}
);

// 2D interval brush that publishes its extent to $query; outline only, no fill.
const regionBrush = vg.intervalXY({
  as: $query,
  brush: {fillOpacity: 0, stroke: "currentColor"}
});

export default vg.plot(
  rasterLayer,
  overallFit,
  selectedFit,
  regionBrush,
  vg.xDomain([0, 24]),
  vg.yDomain([-60, 180]),
  vg.colorScale("symlog"),
  vg.colorScheme("blues")
);
yaml
# Declarative (YAML) form of the "Linear Regression 10M" plot specification.
meta:
  title: Linear Regression 10M
  description: >
    A linear regression plot predicting flight arrival delay based on
    the time of departure, over 10 million flight records.
    Regression computation is performed in the database, with optimized
    selection updates using data cube indexes.
    The area around a regression line shows a 95% confidence interval.
    Select a region to view regression results for a data subset.
# Named datasets. flights10m is defined by a SQL query over a remote Parquet
# file: ARR_DELAY is clamped to [-60, 180] and cast to DOUBLE as `delay`;
# DISTANCE and DEP_TIME are exposed as `distance` and `time`.
data:
  flights10m: SELECT
    GREATEST(-60, LEAST(ARR_DELAY, 180))::DOUBLE AS delay,
    DISTANCE AS distance,
    DEP_TIME AS time
    FROM 'https://idl.uw.edu/mosaic-datasets/data/flights-10m.parquet'
# Plot layers: a raster of all rows, two regression lines, and a brush.
plot:
# Raster of time vs. delay over the whole table.
- mark: raster
  data: { from: flights10m }
  x: time
  y: delay
  pixelSize: 4
  pad: 0
  imageRendering: pixelated
# Regression over the full (unfiltered) table, stroked gray.
- mark: regressionY
  data: { from: flights10m }
  x: time
  y: delay
  stroke: gray
# Regression over only the rows passing the $query selection, stroked firebrick.
- mark: regressionY
  data: { from: flights10m, filterBy: $query }
  x: time
  y: delay
  stroke: firebrick
# 2D interval brush that writes to $query; drawn as an unfilled outline.
# NOTE(review): $query is not declared in this spec; presumably it is created
# implicitly as an intersect selection (the JS form uses Selection.intersect) — confirm.
- select: intervalXY
  as: $query
  brush: { fillOpacity: 0, stroke: currentColor }
# Fixed axis domains; the y domain matches the [-60, 180] clamp applied to delay.
xDomain: [0, 24]
yDomain: [-60, 180]
# Color scale type and scheme for the plot.
colorScale: symlog
colorScheme: blues
json
{
  "meta": {
    "title": "Linear Regression 10M",
    "description": "A linear regression plot predicting flight arrival delay based on the time of departure, over 10 million flight records. Regression computation is performed in the database, with optimized selection updates using data cube indexes. The area around a regression line shows a 95% confidence interval. Select a region to view regression results for a data subset.\n"
  },
  "data": {
    "flights10m": "SELECT GREATEST(-60, LEAST(ARR_DELAY, 180))::DOUBLE AS delay, DISTANCE AS distance, DEP_TIME AS time FROM 'https://idl.uw.edu/mosaic-datasets/data/flights-10m.parquet'"
  },
  "plot": [
    {
      "mark": "raster",
      "data": {"from": "flights10m"},
      "x": "time",
      "y": "delay",
      "pixelSize": 4,
      "pad": 0,
      "imageRendering": "pixelated"
    },
    {
      "mark": "regressionY",
      "data": {"from": "flights10m"},
      "x": "time",
      "y": "delay",
      "stroke": "gray"
    },
    {
      "mark": "regressionY",
      "data": {"from": "flights10m", "filterBy": "$query"},
      "x": "time",
      "y": "delay",
      "stroke": "firebrick"
    },
    {
      "select": "intervalXY",
      "as": "$query",
      "brush": {"fillOpacity": 0, "stroke": "currentColor"}
    }
  ],
  "xDomain": [0, 24],
  "yDomain": [-60, 180],
  "colorScale": "symlog",
  "colorScheme": "blues"
}