Skip to content

Linear Regression 10M

A linear regression plot predicting flight arrival delay based on the time of departure, over 10 million flight records. Regression computation is performed in the database, with optimized selection updates using pre-aggregated materialized views. The area around a regression line shows a 95% confidence interval. Select a region to view regression results for a data subset.

Loading Example...

Specification

js
import * as vg from "@uwdata/vgplot";

// SQL run once up front: a base table with arrival delay clamped to
// [-60, 180], plus three sampled copies of that table (10%, 5%, 1%).
const setupStatements = [
  `CREATE TABLE IF NOT EXISTS flights10m AS SELECT GREATEST(-60, LEAST(ARR_DELAY, 180))::DOUBLE AS delay, DISTANCE AS distance, DEP_TIME AS time FROM 'https://idl.uw.edu/mosaic-datasets/data/flights-10m.parquet'`,
  `CREATE TABLE IF NOT EXISTS flights10p AS SELECT * FROM flights10m USING SAMPLE 10%`,
  `CREATE TABLE IF NOT EXISTS flights5p AS SELECT * FROM flights10m USING SAMPLE 5%`,
  `CREATE TABLE IF NOT EXISTS flights1p AS SELECT * FROM flights10m USING SAMPLE 1%`,
];

await vg.coordinator().exec(setupStatements);

// $data holds the name of the table every mark reads from (set by the menu);
// $query is the selection bound to the interval brush below.
const $data = vg.Param.value("flights10m");
const $query = vg.Selection.intersect();

// Menu entries: each value is the name of one of the tables created above.
const sampleOptions = [
  {value: "flights10m", label: "Full Data"},
  {value: "flights10p", label: "10% Sample"},
  {value: "flights5p", label: "5% Sample"},
  {value: "flights1p", label: "1% Sample"},
];

// Every mark encodes the same two columns.
const timeVsDelay = {x: "time", y: "delay"};

// Table picker; the chosen value is stored in $data.
const sampleMenu = vg.menu({
  label: "Sample",
  as: $data,
  options: sampleOptions,
});

const regressionPlot = vg.plot(
  // Raster layer over the currently selected table.
  vg.raster(vg.from($data), {
    ...timeVsDelay,
    pixelSize: 4,
    pad: 0,
    imageRendering: "pixelated",
  }),
  // Gray line: regression over the whole selected table.
  vg.regressionY(vg.from($data), {...timeVsDelay, stroke: "gray"}),
  // Firebrick line: regression over only the rows passing the $query filter.
  vg.regressionY(vg.from($data, {filterBy: $query}), {
    ...timeVsDelay,
    stroke: "firebrick",
  }),
  // 2D interval brush bound to $query; drawn as an unfilled outline.
  vg.intervalXY({
    as: $query,
    brush: {fillOpacity: 0, stroke: "currentColor"},
  }),
  vg.xDomain([0, 24]),
  // Same bounds as the delay clamp applied in the base-table SQL.
  vg.yDomain([-60, 180]),
  vg.colorScale("symlog"),
  vg.colorScheme("blues"),
  // NOTE(review): vg.Fixed presumably keeps the color domain from being
  // recomputed on updates; confirm against the vgplot docs.
  vg.colorDomain(vg.Fixed),
);

export default vg.vconcat(sampleMenu, vg.vspace(10), regressionPlot);
yaml
# Declarative spec for the "Linear Regression 10M" example: a sample-picker
# menu stacked above a raster plot with two regression lines and a brush.
meta:
  title: Linear Regression 10M
  description: >
    A linear regression plot predicting flight arrival delay based on
    the time of departure, over 10 million flight records.
    Regression computation is performed in the database, with optimized
    selection updates using pre-aggregated materialized views.
    The area around a regression line shows a 95% confidence interval.
    Select a region to view regression results for a data subset.
# Tables: flights10m clamps ARR_DELAY to [-60, 180] as `delay` and renames
# DISTANCE/DEP_TIME to `distance`/`time`; the other three are 10%, 5% and 1%
# samples drawn from flights10m.
data:
  flights10m: SELECT
    GREATEST(-60, LEAST(ARR_DELAY, 180))::DOUBLE AS delay,
    DISTANCE AS distance,
    DEP_TIME AS time
    FROM 'https://idl.uw.edu/mosaic-datasets/data/flights-10m.parquet'
  flights10p: SELECT * FROM flights10m USING SAMPLE 10%
  flights5p: SELECT * FROM flights10m USING SAMPLE 5%
  flights1p: SELECT * FROM flights10m USING SAMPLE 1%
# `data` is the name of the table the marks read from; it starts at the full
# table and is changed by the menu below.
# NOTE(review): $query is referenced below but not declared here; presumably
# the spec loader creates a selection for it implicitly -- confirm.
params:
  data: flights10m
vconcat:
# Menu whose chosen value (a table name from `data` above) is stored in $data.
- input: menu
  label: Sample
  as: $data
  options:
    - { value: flights10m, label: Full Data }
    - { value: flights10p, label: 10% Sample }
    - { value: flights5p, label: 5% Sample }
    - { value: flights1p, label: 1% Sample }
- vspace: 10
- plot:
  # Raster layer of delay vs. time over the selected table.
  - mark: raster
    data: { from: $data }
    x: time
    y: delay
    pixelSize: 4
    pad: 0
    imageRendering: pixelated
  # Gray regression line over the whole selected table.
  - mark: regressionY
    data: { from: $data }
    x: time
    y: delay
    stroke: gray
  # Firebrick regression line over only the rows filtered by $query.
  - mark: regressionY
    data: { from: $data, filterBy: $query }
    x: time
    y: delay
    stroke: firebrick
  # 2D interval brush bound to $query, drawn as an unfilled outline.
  - select: intervalXY
    as: $query
    brush: { fillOpacity: 0, stroke: currentColor }
  # yDomain uses the same bounds as the delay clamp in the flights10m query.
  xDomain: [0, 24]
  yDomain: [-60, 180]
  colorScale: symlog
  colorScheme: blues
  colorDomain: Fixed
json
{
  "meta": {
    "title": "Linear Regression 10M",
    "description": "A linear regression plot predicting flight arrival delay based on the time of departure, over 10 million flight records. Regression computation is performed in the database, with optimized selection updates using pre-aggregated materialized views. The area around a regression line shows a 95% confidence interval. Select a region to view regression results for a data subset.\n"
  },
  "data": {
    "flights10m": "SELECT GREATEST(-60, LEAST(ARR_DELAY, 180))::DOUBLE AS delay, DISTANCE AS distance, DEP_TIME AS time FROM 'https://idl.uw.edu/mosaic-datasets/data/flights-10m.parquet'",
    "flights10p": "SELECT * FROM flights10m USING SAMPLE 10%",
    "flights5p": "SELECT * FROM flights10m USING SAMPLE 5%",
    "flights1p": "SELECT * FROM flights10m USING SAMPLE 1%"
  },
  "params": {
    "data": "flights10m"
  },
  "vconcat": [
    {
      "input": "menu",
      "label": "Sample",
      "as": "$data",
      "options": [
        {
          "value": "flights10m",
          "label": "Full Data"
        },
        {
          "value": "flights10p",
          "label": "10% Sample"
        },
        {
          "value": "flights5p",
          "label": "5% Sample"
        },
        {
          "value": "flights1p",
          "label": "1% Sample"
        }
      ]
    },
    {
      "vspace": 10
    },
    {
      "plot": [
        {
          "mark": "raster",
          "data": {
            "from": "$data"
          },
          "x": "time",
          "y": "delay",
          "pixelSize": 4,
          "pad": 0,
          "imageRendering": "pixelated"
        },
        {
          "mark": "regressionY",
          "data": {
            "from": "$data"
          },
          "x": "time",
          "y": "delay",
          "stroke": "gray"
        },
        {
          "mark": "regressionY",
          "data": {
            "from": "$data",
            "filterBy": "$query"
          },
          "x": "time",
          "y": "delay",
          "stroke": "firebrick"
        },
        {
          "select": "intervalXY",
          "as": "$query",
          "brush": {
            "fillOpacity": 0,
            "stroke": "currentColor"
          }
        }
      ],
      "xDomain": [
        0,
        24
      ],
      "yDomain": [
        -60,
        180
      ],
      "colorScale": "symlog",
      "colorScheme": "blues",
      "colorDomain": "Fixed"
    }
  ]
}