2
votes

I am using dc.js and crossfilter.js to create a d3 dashboard, and am wondering how to implement a regression line into a scatterplot chart that responds to filtering.

I have been playing with a few examples re adding a regression line, but I have been unsuccessful extracting and incorporating the code.

I don't have a problem with the math, but rather with how to access the filtered data from the dimension, and then how to add the regression line to the filtered scatterplot chart (so that the regression line also responds to future filtering).

jsFiddle Demo

// Sample records: one entry per day with a `cars` and a `bikes` measurement.
// Every field is stored as a string here.
var data = [
// NOTE(review): this first record looks like a CSV header row that was carried
// over into the data; its "cars"/"bikes" values are not numeric strings, so
// unary-plus coercion of them yields NaN.
{"record":"record","date":"date","cars":"cars","bikes":"bikes"},
{"record":"1","date":"01/05/2012","cars":"1488.1","bikes":"49.73"},
{"record":"2","date":"02/05/2012","cars":"1374.29","bikes":"52.44"},
{"record":"3","date":"03/05/2012","cars":"1353.01","bikes":"47.92"},
{"record":"4","date":"04/05/2012","cars":"1420.33","bikes":"50.69"},
{"record":"5","date":"05/05/2012","cars":"1544.11","bikes":"47.47"},
{"record":"6","date":"06/05/2012","cars":"1292.84","bikes":"47.75"},
{"record":"7","date":"07/05/2012","cars":"1318.9","bikes":"48.64"},
{"record":"8","date":"08/05/2012","cars":"1686.3","bikes":"50.9"},
{"record":"9","date":"09/05/2012","cars":"1603.99","bikes":"53.44"},
{"record":"10","date":"10/05/2012","cars":"1420.1","bikes":"53.29"},
{"record":"11","date":"11/05/2012","cars":"1410.8","bikes":"54.06"},
{"record":"12","date":"12/05/2012","cars":"1374.62","bikes":"51.24"},
{"record":"13","date":"13/05/2012","cars":"1279.53","bikes":"53.96"},
{"record":"14","date":"14/05/2012","cars":"1330.47","bikes":"49.5"},
{"record":"15","date":"15/05/2012","cars":"1377.61","bikes":"52.32"},
{"record":"16","date":"16/05/2012","cars":"1302.12","bikes":"51.96"},
{"record":"17","date":"17/05/2012","cars":"1326.9","bikes":"49.86"},
{"record":"18","date":"18/05/2012","cars":"1181.55","bikes":"50.25"},
{"record":"19","date":"19/05/2012","cars":"1493.75","bikes":"51.24"},
{"record":"20","date":"20/05/2012","cars":"1463.9","bikes":"50.88"},
{"record":"21","date":"21/05/2012","cars":"1370.16","bikes":"51.09"},
{"record":"22","date":"22/05/2012","cars":"1403.3","bikes":"51.67"},
{"record":"23","date":"23/05/2012","cars":"1277.65","bikes":"49.3"},
{"record":"24","date":"24/05/2012","cars":"1361.94","bikes":"50.47"},
{"record":"25","date":"25/05/2012","cars":"1400.8","bikes":"51.55"},
{"record":"26","date":"26/05/2012","cars":"1289.09","bikes":"47.17"},
{"record":"27","date":"27/05/2012","cars":"1258.39","bikes":"52.12"},
{"record":"28","date":"28/05/2012","cars":"1288.71","bikes":"49.28"},
{"record":"29","date":"29/05/2012","cars":"1511.86","bikes":"50.73"},
{"record":"30","date":"30/05/2012","cars":"1300.38","bikes":"52.39"},
{"record":"31","date":"31/05/2012","cars":"1455.19","bikes":"49.53"},
{"record":"32","date":"01/06/2012","cars":"1311.89","bikes":"50.37"},
{"record":"33","date":"02/06/2012","cars":"1368.64","bikes":"50.87"},
{"record":"34","date":"03/06/2012","cars":"1360.05","bikes":"50.51"},
{"record":"35","date":"04/06/2012","cars":"1382.56","bikes":"49.67"},
{"record":"36","date":"05/06/2012","cars":"1304.15","bikes":"47.6"},
{"record":"37","date":"06/06/2012","cars":"1271.57","bikes":"50.22"},
{"record":"38","date":"07/06/2012","cars":"1442.38","bikes":"50.8"},
{"record":"39","date":"08/06/2012","cars":"1406.38","bikes":"53.14"},
{"record":"40","date":"09/06/2012","cars":"1724.16","bikes":"49.66"},
{"record":"41","date":"10/06/2012","cars":"1931.05","bikes":"53"},
{"record":"42","date":"11/06/2012","cars":"1669.47","bikes":"53.71"},
{"record":"43","date":"12/06/2012","cars":"1794.06","bikes":"51.78"},
{"record":"44","date":"13/06/2012","cars":"1625.98","bikes":"51.58"},
{"record":"45","date":"14/06/2012","cars":"1371.51","bikes":"52.36"},
{"record":"46","date":"15/06/2012","cars":"1418.05","bikes":"47.64"},
{"record":"47","date":"16/06/2012","cars":"1431","bikes":"53.14"},
{"record":"48","date":"17/06/2012","cars":"1527.21","bikes":"48.63"},
{"record":"49","date":"18/06/2012","cars":"1320.95","bikes":"51.7"},
{"record":"50","date":"19/06/2012","cars":"1396.93","bikes":"52.92"}
];
// Names of the two record fields plotted against each other:
// tSel1 is drawn on the x axis, tSel2 on the y axis.
// Declared with `var` so they are no longer implicit globals (which throw in
// strict mode) while remaining visible to other top-level scripts.
var tSel1 = "cars";
var tSel2 = "bikes";

// Coerce the two selected fields from strings to numbers, in place.
data.forEach(function (d) {
	d[tSel1] = +d[tSel1];
	d[tSel2] = +d[tSel2];
});

var facts = crossfilter(data);

var allDimension = facts.groupAll();
// The scatter plot reads its coordinates from the group key, so the dimension
// key is an [x, y] pair.
var scatterDimension = facts.dimension(function (d) { return [+d[tSel1], +d[tSel2]]; });
var scatterGroup = scatterDimension.group().reduceSum(function (d) { return d[tSel1]; });

// Axis domains: pad the observed extent by 10% on each side so that the
// outermost points do not sit on the chart border.
var maxY1 = d3.max(data, function (d) { return d[tSel1]; });
var maxY2 = d3.max(data, function (d) { return d[tSel2]; });
var maxY1Plus = maxY1 + (maxY1 * 0.1);
var maxY2Plus = maxY2 + (maxY2 * 0.1);

var minY1 = d3.min(data, function (d) { return d[tSel1]; });
var minY1Minus = minY1 * 0.9;
var minY2 = d3.min(data, function (d) { return d[tSel2]; });
var minY2Minus = minY2 * 0.9;

var xyScatterChart = dc.scatterPlot("#scatterPlot");
xyScatterChart
	.width(600)
	.height(400)
	.margins({top: 20, right: 20, bottom: 20, left: 60})
	.dimension(scatterDimension)
	.group(scatterGroup)
	.symbolSize(6)
	.highlightedSize(15)
	.brushOn(false)
	.excludedOpacity(0.5)
	.excludedSize(5)
	.renderHorizontalGridLines(true)
	.renderVerticalGridLines(true)
	.x(d3.scale.linear().domain([minY1Minus, maxY1Plus]))
	.y(d3.scale.linear().domain([minY2Minus, maxY2Plus]));

// renderAll() performs the initial draw of every registered chart; an
// immediate redrawAll() afterwards is redundant and has been dropped.
dc.renderAll();
<!-- dc.js stylesheet plus its runtime dependencies (d3 and crossfilter must load before dc).
     Assets are referenced over https so they are not blocked as mixed content on https pages. -->
<link href="https://dc-js.github.io/dc.js/css/dc.css" rel="stylesheet"/>
<script src="https://dc-js.github.io/dc.js/js/d3.js"></script>
<script src="https://dc-js.github.io/dc.js/js/crossfilter.js"></script>
<script src="https://dc-js.github.io/dc.js/js/dc.js"></script>
<!-- Anchor element targeted by dc.scatterPlot("#scatterPlot"). -->
<div id="scatterPlot"></div>

References:

https://groups.google.com/forum/#!topic/dc-js-user-group/HaQMegKa_U0

https://bl.ocks.org/ctufts/298bfe4b11989960eeeecc9394e9f118

2

2 Answers

4
votes

It would be awesome to include an example in dc.js, since this is something lots of people can use.

Maybe we can work together on that? I don't know the math but here's a simple way to use a composite chart to display a line on data calculated from an aggregated group.

First off, here's the composite chart with the old scatter plot embedded in it:

// Composite chart: owns the size, margins and axes shared by its child charts.
var composite = dc.compositeChart("#composite");

// Shared layout. The composite requires a dimension and group of its own,
// so it is handed the scatter plot's.
composite.width(600);
composite.height(400);
composite.margins({top: 20, right: 20, bottom: 20, left: 60});
composite.dimension(scatterDimension);
composite.group(scatterGroup);

// Children: the scatter plot keeps its point-specific options, and the line
// chart draws the regression line from a group derived from the scatter group.
composite.compose([
    dc.scatterPlot(composite)
        .symbolSize(6)
        .highlightedSize(15)
        .brushOn(false)
        .excludedOpacity(0.5)
        .excludedSize(5)
        .renderHorizontalGridLines(true)
        .renderVerticalGridLines(true),
    dc.lineChart(composite)
        .group(regressionGroup(scatterGroup))
]);

// Coordinate scales live on the composite, not on the children.
composite.x(d3.scale.linear().domain([minY1Minus,maxY1Plus]));
composite.y(d3.scale.linear().domain([minY2Minus,maxY2Plus]));

Note that we're supplying the scatter group to both the composite and the scatter plot. That's just because the composite chart requires a group even though it doesn't actually use it.

We've moved the parameters that have to do with coordinates to the main (composite) chart, but everything that is specific to the scatter plot stays on it. We've also added a line chart to the composite, which uses a "fake group" based on the scatter group.

This fake group is particularly fake, but it should be enough to get you started. Since I don't have time to learn the math today, I'll just pretend that the first and last points are the regression:

/**
 * Wraps a scatter-plot group in a "fake group" whose data can be fed to a
 * line chart.
 *
 * This is a placeholder for a real regression: it simply returns the first
 * and the last valid point of the source group as the two ends of the line.
 *
 * @param {{all: function(): Array<{key: Array, value: *}>}} group
 *     Source group whose keys are [x, y] pairs.
 * @returns {{all: function(): Array<{key: *, value: *}>}}
 *     Group-like object. `all()` is evaluated lazily on every call and returns
 *     two {key: x, value: y} points, or an empty array when the source group
 *     contains no valid point.
 */
function regressionGroup(group) {
  return {
    all: function() {
      const points = group.all()
        .map(function(entry) { return entry.key; })
        // Skip keys whose x or y is not a number.
        .filter(function(key) { return !isNaN(key[0]) && !isNaN(key[1]); })
        // The scatter plot keys on [x, y]; the line chart wants key = x, value = y.
        .map(function(key) { return {key: key[0], value: key[1]}; });

      // Nothing valid to draw: return no points rather than [undefined, undefined].
      if (points.length === 0) {
        return [];
      }
      return [points[0], points[points.length - 1]];
    }
  };
}

As with all fake groups, the idea is to calculate some group-like data when the chart asks for it (and no sooner), based on another group. Here the calculation is not very interesting, because you know how to calculate a regression and I don't. You'll want to replace first and last and the for loop with a real calculation; all this is doing is checking for valid points and keeping the first and last ones that it finds.

Interestingly, the scatter plot takes data where the key contains both x and y coordinates, but the line chart takes data where the key is x and the value is y. That's why we have the transformation kv = {key: key[0], value: key[1]}

Postscript

Note that you'll run into a dc.js bug if you put the regression guide points outside of the domain - the stack mixin is too aggressive about clipping points to the domain. There is an easy, ugly workaround that seems to work in this case: tell the line chart it has an ordinal x scale even though it doesn't:

// Same composite setup as before (elided with "// ..."), except that a
// reference to the child line chart is kept so it can be patched afterwards.
var composite = dc.compositeChart("#composite"),
  lineChart;
composite   
    .width(600)
  // ...
  .compose([
  // ...
  // Assign while composing so the line chart is reachable after the chain.
  lineChart = dc.lineChart(composite)
  .group(regressionGroup(scatterGroup))
])
// Workaround: make the line chart report an ordinal x scale so that
// regression points lying outside the x domain are not clipped away.
lineChart.isOrdinal = d3.functor(true);

Yuck! But it works! This hack probably only works inside a composite!

https://jsfiddle.net/gordonwoodhull/5tpcxov1/12/

1
votes

I have a fully functional example of regression. I was precisely doing it when I came here for help and I found your question. It requires regression.js (here).

This follows Gordon's excellent suggestion of a "fake group", which should really be called an inline group, or immediate group, or even group on-the-fly. Here is mine:

/**
 * Wraps a scatter-plot group in a "fake group" that yields the two end points
 * of a linear regression line, computed with regression.js.
 *
 * @param {{all: function(): Array<{key: Array, value: *}>}} group
 *     Source group whose keys are [x, y] pairs.
 * @param {number} min  x value at which the line starts (typically the lower
 *     bound of the x scale).
 * @param {number} max  x value at which the line ends (typically the upper
 *     bound of the x scale).
 * @param {boolean} [filter=false]  When true, points with a falsy x are left
 *     out of the fit.
 * @returns {{all: function(): Array<{key: *, value: *}>}}
 *     Group-like object whose `all()` recomputes the fit on every call and
 *     returns the predicted points at `min` and `max` as {key: x, value: y}.
 */
function myRegressionGroup(group, min, max, filter = false) {
  return {
    all: function() {
      let points = group.all().map((entry) => entry.key);
      if (filter) {
        // NOTE(review): this is a truthiness test, so besides NaN/empty x it
        // also drops points whose x is exactly 0 — confirm that is intended.
        points = points.filter((key) => Boolean(key[0]));
      }

      // Local binding: the fit result no longer leaks into the global scope.
      const reg = regression.linear(points);

      // predict() results are read as [x, y] pairs.
      const first = reg.predict(min);
      const last = reg.predict(max);
      return [{key: first[0], value: first[1]}, {key: last[0], value: last[1]}];
    }
  };
}

Please notice that this function requires a crossfilter group and also the min and max from the x-scale. Since you typically have these values calculated for your xScale, all it takes is reusing them here. This is because the function uses the extremes with the predict method to calculate the two points of the regression line.

The optional filter data wrangler is for you to decide whether to remove empty values on x or not.

@Gordon, what should I do in order to include my regression example in the Examples of using dc.js?