Commit 11757cd6 authored by Patrick Joan Thomson's avatar Patrick Joan Thomson
Browse files

Add monthly aggregation and improve speed

parent acf221e3
......@@ -4,8 +4,11 @@ CREATE SCHEMA api;
-- Create tables for API
CREATE MATERIALIZED VIEW api.bike_results_agg_hour AS SELECT * FROM bikeshare.results_agg_hour;
CREATE MATERIALIZED VIEW api.bike_results_agg_month AS SELECT * FROM bikeshare.results_agg_month;
--CREATE MATERIALIZED VIEW api.city_results_agg_hour AS SELECT * FROM city_bus.results_agg_hour; --Commented out because there is no sample data
--CREATE MATERIALIZED VIEW api.county_results_agg_hour AS SELECT * FROM county_bus.results_agg_hour; --Commented out because there is no sample data
--CREATE MATERIALIZED VIEW api.city_results_agg_month AS SELECT * FROM city_bus.results_agg_month; --Commented out because there is no sample data
--CREATE MATERIALIZED VIEW api.county_results_agg_hour AS SELECT * FROM county_bus.results_agg_hour; --Commented out because there is no sample data
--CREATE MATERIALIZED VIEW api.county_results_agg_month AS SELECT * FROM county_bus.results_agg_month; --Commented out because there is no sample data
CREATE MATERIALIZED VIEW api.od_matrix AS SELECT * FROM mobile_data.od_matrix;
CREATE MATERIALIZED VIEW api.bike_results_agg_day AS
......
......@@ -53,6 +53,35 @@ CREATE TABLE bikeshare.results_agg_hour (
--and the rounding can be done in the API
);
--Creates a table which holds bike trips with start and end locations and groups them by month
DROP TABLE IF EXISTS bikeshare.trips_month;
CREATE TABLE bikeshare.trips_month (
month timestamp, --Aggregated by month
start_station_name varchar(50), --Name of start station
start_station_id int, --ID of start station
x_start double precision, --Latitude of start station
y_start double precision, --Longitude of start station
end_station_name varchar(50), --Name of end station
end_station_id int, --ID of end station
x_end double precision, --Latitude of end station
y_end double precision, --Longitude of end station
count int --Count of trips made in the specified month from start station to end station
);
--Creates a table that shows how many trips are made from one neighborhood
--to another based on the bikeshare data, grouped by month
DROP TABLE IF EXISTS bikeshare.results_agg_month;
CREATE TABLE bikeshare.results_agg_month (
month timestamp, --Aggregated by month
start_neighborhood_name varchar(50), --Name of start neighborhood
start_neighborhood_id int, --ID of start neighborhood
end_neighborhood_name varchar(50), --Name of end neighborhood
end_neighborhood_id int, --ID of end neighborhood
count float --Count of trips made in the specified month from start neighborhood to end neighborhood.
--This is a float since aggregating month counts to month can lead to a significant error in count if it's an int
--and the rounding can be done in the API
);
--========--
--CITY_BUS--
--========--
......@@ -82,6 +111,32 @@ CREATE TABLE city_bus.results_agg_hour (
--and the rounding can be done in the API
);
--Creates a table that shows how many trips are made from one neighborhood
--to another based on the city_bus data, grouped by month
DROP TABLE IF EXISTS city_bus.trips_month;
CREATE TABLE city_bus.trips_month (
month timestamp, --Aggregated by hour
start_bus_stop_name varchar(50), --Name of start bus stop
start_bus_stop_code varchar(50), --Code of start bus stop
end_bus_stop_name varchar(50), --Name of end bus stop
end_bus_stop_code varchar(50), --Code of end bus stop
count int --Count of trips made in the specified hour from start bus stop to end bus stop
);
--Creates a table that shows how many trips are made from one neighborhood
--to another based on the city_bus data, grouped by month
DROP TABLE IF EXISTS city_bus.results_agg_month;
CREATE TABLE city_bus.results_agg_month (
month timestamp, --Aggregated by hour
start_neighborhood_name varchar(50), --Name of start neighborhood
start_neighborhood_id varchar(50), --ID of start neighborhood
end_neighborhood_name varchar(50), --Name of end neighborhood
end_neighborhood_id varchar(50), --ID of end neighborhood
count float --Count of trips made in the specified hour from start neighborhood to end neighborhood.
--This is a float since aggregating hour counts to month can lead to a significant error in count if it's an int
--and the rounding can be done in the API
);
--==========--
--COUNTY_BUS--
--==========--
......@@ -111,6 +166,31 @@ CREATE TABLE county_bus.results_agg_hour(
--and the rounding can be done in the API
);
--Creates a table for countybus trips, grouped by month, start and end point
DROP TABLE IF EXISTS county_bus.trips_month;
CREATE TABLE county_bus.trips_month (
month timestamp, --Aggregated by month
start_bus_stop_name varchar(50), --Name of start bus stop
start_bus_stop_code varchar(50), --Code of start bus stop
end_bus_stop_name varchar(50), --Name of end bus stop
count int --Count of trips made in the specified month from start bus stop to end bus stop
);
--Creates a table that shows how many trips are made from one territorial community
--to another based on the county_bus data, grouped by month
DROP TABLE IF EXISTS county_bus.results_agg_month;
CREATE TABLE county_bus.results_agg_month(
month timestamp, --Aggregated by month
start_tc_name varchar(50), --Name of start territorial community
start_tc_code int, --Code of start territorial community
end_tc_name varchar(50), --Name of end territorial community
end_tc_code int, --Code of end territorial community
count float --Count of trips made in the specified month from start territorial community to end territorial community.
--This is a float since aggregating month counts to month can lead to a significant error in count if it's an int
--and the rounding can be done in the API
);
--==========--
--POPULATION--
--==========--
......
......@@ -162,6 +162,93 @@ GROUP BY
ORDER BY
hour;
--Emptying table before inserting
TRUNCATE bikeshare.trips_month CASCADE;
\echo '\nInserting into bikeshare.trips_month...\n'
--Origin-Destination matrix for bikeshare counts by groups of month, start and end station
--Coordinates are in 3301
INSERT INTO bikeshare.trips_month
SELECT
DATE_TRUNC('month', a.unlocked_at) AS month,
a.start_station_name,
a.start_station_id::int AS start_id,
ST_X(b.geom) AS x_start, --Start station latitude
ST_Y(b.geom) AS y_start, --Start station longitude
a.end_station_name,
a.end_station_id::int AS end_id,
ST_X(c.geom) AS x_end, --Start station latitude
ST_Y(c.geom) AS y_end, --Start station longitude
COUNT(*) AS count --Count of trips made from start to end station in the specified month
FROM
bikeshare.valid_data AS a,
spatial_reference.bike_stops AS b,
spatial_reference.bike_stops AS c
WHERE
b.id = a.start_station_id AND
c.id = a.end_station_id
GROUP BY
month,
start_station_name,
start_id,
x_start,
y_start,
end_station_name,
end_id,
x_end,
y_end
ORDER BY
month;
--Emptying table before inserting
TRUNCATE bikeshare.results_agg_month CASCADE;
\echo '\nInserting into bikeshare.results_agg_month...\n'
--Shows trips made from one neighborhood to another in the span of one month based on bikeshare data
INSERT INTO bikeshare.results_agg_month
WITH
--Matching the bikeshare trips and stations with the respective neighborhoods
results_with_neighborhoods_not_aggregated AS (
SELECT
a.month AS month,
b.nimi AS start_nimi,
b.asum AS start_neighborhood_name,
b.share AS start_share,
b.neighborhood_id::integer AS start_neighborhood_id,
c.nimi AS end_nimi,
c.asum AS end_neighborhood_name,
c.share AS end_share,
c.neighborhood_id::integer AS end_neighborhood_id,
count::float AS count_stops
FROM
bikeshare.trips_month AS a,
spatial_reference.bike_stops_neighborhood AS b,
spatial_reference.bike_stops_neighborhood AS c
WHERE
a.start_station_id = b.bikestop_id::integer AND
a.end_station_id = c.bikestop_id::integer
)
SELECT
month,
start_neighborhood_name,
start_neighborhood_id,
end_neighborhood_name,
end_neighborhood_id,
SUM(start_share*end_share*count_stops) AS count --Sum per month is a float
FROM
results_with_neighborhoods_not_aggregated
WHERE
start_neighborhood_id != end_neighborhood_id
GROUP BY
month,
start_neighborhood_name,
start_neighborhood_id,
end_neighborhood_name,
end_neighborhood_id
ORDER BY
month;
\echo '\nDropping Bikeshare temporary tables...\n'
--Dropping temporary tables
......
......@@ -301,6 +301,8 @@ CREATE INDEX next_bus_stop_name_n_idx ON city_bus.no_card_id_records (next_bus_s
CREATE INDEX next_bus_stop_code_n_idx ON city_bus.no_card_id_records (next_bus_stop_code); */
\echo '\nInserting into city_bus.trips_hour...\n'
--Clearing trips_hour table to make way for new data
TRUNCATE city_bus.trips_hour CASCADE;
--Aggregate on stop level by hour
INSERT INTO city_bus.trips_hour
SELECT
......@@ -327,7 +329,8 @@ GROUP BY
ORDER BY
hour ASC;
--Clearing results_agg_hour table to make way for new data
TRUNCATE city_bus.results_agg_hour CASCADE;
\echo '\nInserting into city_bus.results_agg_hour...\n'
--Aggregate hourly data to neigborhood level
INSERT INTO city_bus.results_agg_hour
......@@ -371,6 +374,82 @@ GROUP BY
ORDER BY
hour;
\echo '\nInserting into city_bus.trips_month...\n'
--Clearing trips_month table to make way for new data
TRUNCATE city_bus.trips_month CASCADE;
--Aggregate on stop level by month
INSERT INTO city_bus.trips_month
SELECT
DATE_TRUNC('month', validation_time) AS month,
bus_stop_name AS start_bus_stop_name,
bus_stop_code AS start_bus_stop_code,
next_bus_stop_name AS end_bus_stop_name,
next_bus_stop_code AS end_bus_stop_code,
COUNT(*) AS count
FROM
city_bus.valid_records
WHERE
next_bus_stop_name IS NOT NULL OR
next_bus_stop_code IS NOT NULL AND
DATE(validation_time) = DATE(next_validation_time) --This selects records that have the next validation time in the same day as the original validation.
--This might leave out some people who for example take a bus at 21 then also take a bus after midnight,
--but this doesn't happen very often and we can consider it a statistical anomaly.
GROUP BY
month,
start_bus_stop_name,
start_bus_stop_code,
end_bus_stop_name,
end_bus_stop_code
ORDER BY
month ASC;
--Clearing results_agg_month table to make way for new data
TRUNCATE city_bus.results_agg_month CASCADE;
\echo '\nInserting into city_bus.results_agg_month...\n'
--Aggregate monthly data to neigborhood level
INSERT INTO city_bus.results_agg_month
WITH
sq1 AS (
SELECT
a.month AS month,
b.neighborhood_id AS start_neighborhood_id,
b.neighborhood_name AS start_neighborhood_name,
b.share AS start_share,
c.neighborhood_id AS end_neighborhood_id,
c.neighborhood_name AS end_neighborhood_name,
c.share AS end_share,
a.count
FROM
city_bus.trips_month AS a,
city_bus.bus_stop_neighborhood_share AS b,
city_bus.bus_stop_neighborhood_share AS c
WHERE
a.start_bus_stop_code = b.stop_code AND
a.end_bus_stop_code = c.stop_code
)
SELECT
month,
start_neighborhood_name,
start_neighborhood_id,
end_neighborhood_name,
end_neighborhood_id,
SUM(start_share*end_share*count) AS count
FROM
sq1
WHERE
start_neighborhood_id <> end_neighborhood_id AND
count > 0
GROUP BY
month,
start_neighborhood_name,
start_neighborhood_id,
end_neighborhood_name,
end_neighborhood_id
ORDER BY
month;
\echo '\nDropping City Bus temporary tables...\n'
DROP TABLE city_bus.valid_records;
......
......@@ -226,37 +226,62 @@ CREATE TABLE spatial_reference.county_stop_clusters_new AS
\echo '\nCreating Voronoi polygons for bus stops...\n'
--Create voronois for the stops
--This one is for the earlier dataset
DROP TABLE IF EXISTS spatial_reference.county_bus_stops_voronoi_geom_old;
CREATE TABLE spatial_reference.county_bus_stops_voronoi_geom_old AS
SELECT (ST_DUMP(ST_VoronoiPolygons(ST_Collect(geom)))).geom AS geom
FROM spatial_reference.county_stop_clusters_old;
--Create index for faster calculation
CREATE INDEX county_bus_stops_voronoi_geom_old_idx
ON spatial_reference.county_bus_stops_voronoi_geom_old
USING GIST(geom);
--Get intersecting voronois
DROP TABLE IF EXISTS spatial_reference.county_bus_stops_voronoi_geom_intersecting_old;
CREATE TABLE spatial_reference.county_bus_stops_voronoi_geom_intersecting_old AS
SELECT ST_Intersection(a.geom,b.geom) AS geom
FROM spatial_reference.county_bus_stops_voronoi_geom_old AS a, spatial_reference.tartu_county_contour AS b;
--Create index for faster calculation
CREATE INDEX county_bus_stops_voronoi_geom_intersecting_old_idx
ON spatial_reference.county_bus_stops_voronoi_geom_intersecting_old
USING GIST(geom);
DROP TABLE IF EXISTS spatial_reference.county_bus_stops_voronoi_old;
CREATE TABLE spatial_reference.county_bus_stops_voronoi_old AS
WITH
sq1 AS (
SELECT (ST_DUMP(ST_VoronoiPolygons(ST_Collect(geom)))).geom AS geom
FROM spatial_reference.county_stop_clusters_old
),
sq2 AS (
SELECT ST_Intersection(a.geom,b.geom) AS geom
FROM sq1 AS a, spatial_reference.tartu_county_contour AS b
)
SELECT a.start_bus_stop, b.geom
FROM spatial_reference.county_stop_clusters_old AS a, sq2 AS b
WHERE ST_intersects(a.geom,b.geom);
SELECT a.start_bus_stop, b.geom
FROM spatial_reference.county_stop_clusters_old AS a, spatial_reference.county_bus_stops_voronoi_geom_intersecting_old AS b
WHERE ST_intersects(a.geom,b.geom);
--Create voronois for the stops
--This one is for the newer dataset
DROP TABLE IF EXISTS spatial_reference.county_bus_stops_voronoi_geom_new;
CREATE TABLE spatial_reference.county_bus_stops_voronoi_geom_new AS
SELECT (ST_DUMP(ST_VoronoiPolygons(ST_Collect(geom)))).geom AS geom
FROM spatial_reference.county_stop_clusters_new;
--Create index for faster calculation
CREATE INDEX county_bus_stops_voronoi_geom_new_idx
ON spatial_reference.county_bus_stops_voronoi_geom_new
USING GIST(geom);
--Get intersecting voronois
DROP TABLE IF EXISTS spatial_reference.county_bus_stops_voronoi_geom_intersecting_new;
CREATE TABLE spatial_reference.county_bus_stops_voronoi_geom_intersecting_new AS
SELECT ST_Intersection(a.geom,b.geom) AS geom
FROM spatial_reference.county_bus_stops_voronoi_geom_new AS a, spatial_reference.tartu_county_contour AS b;
--Create index for faster calculation
CREATE INDEX county_bus_stops_voronoi_geom_intersecting_new_idx
ON spatial_reference.county_bus_stops_voronoi_geom_intersecting_new
USING GIST(geom);
DROP TABLE IF EXISTS spatial_reference.county_bus_stops_voronoi_new;
CREATE TABLE spatial_reference.county_bus_stops_voronoi_new AS
WITH
sq1 AS (
SELECT (ST_DUMP(ST_VoronoiPolygons(ST_Collect(geom)))).geom AS geom
FROM spatial_reference.county_stop_clusters_new
),
sq2 AS (
SELECT ST_Intersection(a.geom,b.geom) AS geom
FROM sq1 AS a, spatial_reference.tartu_county_contour AS b
)
SELECT a.start_bus_stop, b.geom
FROM spatial_reference.county_stop_clusters_new AS a, sq2 AS b
WHERE ST_intersects(a.geom,b.geom);
SELECT a.start_bus_stop, b.geom
FROM spatial_reference.county_stop_clusters_new AS a, spatial_reference.county_bus_stops_voronoi_geom_intersecting_new AS b
WHERE ST_intersects(a.geom,b.geom);
\echo '\nCalculating Voronoi polygons share in territorial communities...\n'
--Calculate how much of voronoi is in which territorial community
......@@ -470,6 +495,109 @@ GROUP BY
ORDER BY
hour;
--Clearing trips_month table to make way for new data
TRUNCATE county_bus.trips_month CASCADE;
\echo '\nInserting into county_bus.trips_month...\n'
--Creates a table for countybus trips, grouped by month, start and end point
INSERT INTO county_bus.trips_month (
WITH tartu_county_lines_union AS (
SELECT * FROM county_bus.tartu_county_lines
UNION ALL
SELECT * FROM county_bus.neighbor_county_lines
)
SELECT
DATE_TRUNC('month', time_measure) AS month,
start_bus_stop,
start_bus_stop_code,
end_bus_stop,
SUM(count) AS count
FROM
tartu_county_lines_union
GROUP BY
month,
start_bus_stop_code,
start_bus_stop,
end_bus_stop
ORDER BY
month,
start_bus_stop,
end_bus_stop,
count
);
--Clearing results_agg_month table to make way for new data
TRUNCATE county_bus.results_agg_month CASCADE;
\echo '\nInserting into county_bus.results_agg_month...\n'
--Inserting data into trips_day table, county busrides grouped by day, start and end point
INSERT INTO county_bus.results_agg_month
WITH
lines_old AS (
SELECT
a.month AS month,
b.tc_kood AS start_tc_id,
b.tc_nimi AS start_tc_name,
b.share AS start_share,
c.tc_kood AS end_tc_id,
c.tc_nimi AS end_tc_name,
c.share AS end_share,
a.count::float AS count
FROM
county_bus.trips_month AS a,
spatial_reference.county_bus_stops_tc_old AS b,
spatial_reference.county_bus_stops_tc_old AS c
WHERE
a.month < TO_TIMESTAMP('01/07/2017', 'DD/MM/YYYY') AND
a.start_bus_stop_name=b.start_bus_stop AND
a.end_bus_stop_name=c.start_bus_stop
),
lines_new AS (
SELECT
a.month AS month,
b.tc_kood AS start_tc_id,
b.tc_nimi AS start_tc_name,
b.share AS start_share,
c.tc_kood AS end_tc_id,
c.tc_nimi AS end_tc_name,
c.share AS end_share,
a.count::float AS count
FROM
county_bus.trips_month AS a,
spatial_reference.county_bus_stops_tc_new AS b,
spatial_reference.county_bus_stops_tc_new AS c
WHERE
a.month > TO_TIMESTAMP('30/06/2017', 'DD/MM/YYYY') AND
a.start_bus_stop_name=b.start_bus_stop AND
a.end_bus_stop_name=c.start_bus_stop
),
lines AS (
SELECT * FROM lines_old
UNION ALL
SELECT * FROM lines_new
)
SELECT
month,
start_tc_name,
start_tc_id::int,
end_tc_name,
end_tc_id::int,
SUM(start_share*end_share*count) AS count
FROM
lines
WHERE
start_tc_id <> end_tc_id
GROUP BY
month,
start_tc_name,
start_tc_id,
end_tc_name,
end_tc_id
ORDER BY
month;
\echo '\nDropping County Bus temporary tables...\n'
--Dropping all temporary tables
DROP TABLE county_bus.tartu_county_lines_raw;
......@@ -480,5 +608,9 @@ DROP TABLE county_bus.stop_clusters_old;
DROP TABLE county_bus.stop_clusters_new;
DROP TABLE county_bus.neighbor_county_lines_raw;
DROP TABLE county_bus.neighbor_county_lines;
DROP TABLE spatial_reference.county_bus_stops_voronoi_geom_old;
DROP TABLE spatial_reference.county_bus_stops_voronoi_geom_intersecting_old;
DROP TABLE spatial_reference.county_bus_stops_voronoi_geom_new;
DROP TABLE spatial_reference.county_bus_stops_voronoi_geom_intersecting_new;
\echo '\nCounty Bus data processing complete!\n'
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment