이전 게시글에 LEFT JOIN
을 한 이유에 대해 설명하지 않은 것 같아서 작성해보려고 한다. 만약 임기 시작 날짜와 끝나는 날짜가 다음과 같다고 가정한다.
date_dim 테이블의 date 컬럼에서 12월 31일에 해당하는 날짜만을 가지고 JOIN 하기 때문에 해당 날짜와 매칭되는 날짜가 존재하지 않는다. 그런 경우 date 컬럼에 NULL이 입력되며, 추후 NULL을 COALESCE 함수를 통해 0으로 변경해주는 것이다.
JOIN된 테이블의 첫 임기 시작 날짜에서 연도만을 추출한다.
date - first_term을 통해 period를 계산하고, NULL은 0으로 변경해준다. 이후 고유한 ID들의 COUNT를 계산하면 코호트 숫자가 계산된다.
SELECT
DATE_PART('year', A.first_term) AS first_year,
COALESCE(DATE_PART('year', AGE(C.date, A.first_term)), 0) AS period,
COUNT(DISTINCT A.id_bioguide) AS cohort_retained
FROM
(
SELECT id_bioguide, MIN(term_start) AS first_term
FROM legislators_terms
GROUP BY id_bioguide
) AS A JOIN legislators_terms AS B ON A.id_bioguide= B.id_bioguide
LEFT JOIN date_dim AS C ON C.date BETWEEN B.term_start AND B.term_end
AND C.month_name = 'December' AND C.day_of_month = 31
GROUP BY 1, 2
ORDER BY 1, 2;
SELECT
first_year,
period,
cohort_retained,
FIRST_VALUE(cohort_retained) OVER(PARTITION BY first_year ORDER BY period) AS cohort_size,
ROUND(cohort_retained * 1.0 / FIRST_VALUE(cohort_retained) OVER(PARTITION BY first_year ORDER BY period), 2) AS pct_retained
FROM
(
SELECT
DATE_PART('year', A.first_term) AS first_year,
COALESCE(DATE_PART('year', AGE(C.date, A.first_term)), 0) AS period,
COUNT(DISTINCT A.id_bioguide) AS cohort_retained
FROM
(
SELECT id_bioguide, MIN(term_start) AS first_term
FROM legislators_terms
GROUP BY id_bioguide
) AS A JOIN legislators_terms AS B ON A.id_bioguide= B.id_bioguide
LEFT JOIN date_dim AS C ON C.date BETWEEN B.term_start AND B.term_end
AND C.month_name = 'December' AND C.day_of_month = 31
GROUP BY 1, 2
ORDER BY 1, 2
) AS sub
ORDER BY 1,2;
SELECT
DISTINCT(id_bioguide),
MIN(term_start) OVER(PARTITION BY id_bioguide) AS first_term,
FIRST_VALUE(state) OVER(PARTITION BY id_bioguide ORDER BY term_start ASC) AS first_state
FROM legislators_terms;
SELECT
first_state,
period,
cohort_retained,
FIRST_VALUE(cohort_retained) OVER(PARTITION BY first_state ORDER BY period ASC) AS cohort_size,
ROUND(cohort_retained * 1.0 / FIRST_VALUE(cohort_retained) OVER(PARTITION BY first_state ORDER BY period ASC), 2) AS pct_retained
FROM
(
SELECT
A.first_state,
COALESCE(DATE_PART('year', AGE(C.date, A.first_term)), 0) AS period,
COUNT(DISTINCT(A.id_bioguide)) AS cohort_retained
FROM
(
SELECT
DISTINCT(id_bioguide),
MIN(term_start) OVER(PARTITION BY id_bioguide) AS first_term,
FIRST_VALUE(state) OVER(PARTITION BY id_bioguide ORDER BY term_start ASC) AS first_state
FROM legislators_terms
) AS A JOIN legislators_terms AS B ON A.id_bioguide = B.id_bioguide
LEFT JOIN date_dim AS C ON C.date BETWEEN B.term_start AND B.term_end
AND C.month_name = 'December' AND C.day_of_month = 31
GROUP BY 1, 2
) AS sub
ORDER BY first_state ASC, period ASC;