오늘도 기초프로젝트 진행하여 해본 내용들 코드와 결과 정리합니다!
# 4개 테이블 left join
# Table : customers, orders, sales, products
SELECT count(*)
from customers c left join orders o on c.customer_id = o.customer_id
left join sales s on o.order_id = s.order_id
left join products p on s.product_id = p.product_id
where o.order_id is not null
-> SQL결과를 파이썬으로 data.info() 한 결과이다.
orders 테이블에 조인되지 않는 값 7개 존재 (join X : ordes → sales)
- order_id = (227, 332, 536, 598, 618, 823, 1000)
최종으로는 Null값 없는 5,000개 Row가 존재해야 맞는 것 같음
위 order_id에 해당하는 번호들이 customers와는 Join되고 sales에서는 Join되지 않아 차이가 발생함을 찾아냄
-> 각각의 테이블을 Inner Join할 때는 문제가 되지않음
# customer 그룹화 진행 후 1,000명 -> 617명 중 상위 10% (60명)
CREATE view rn_price AS (
SELECT *
from
(SELECT customer_id, sum_pay,
RANK() over(order by sum_pay desc) as rn
FROM
(SELECT customer_id , sum(payment) as sum_pay
from orders o
group by 1
order by 2 desc) sub) main
where rn <= 60)
CREATE VIEW view_sub as(
SELECT last_t.customer_id,
last_t.sum_pay,
rn,
s.order_id,
product_id
from
(SELECT rn_price.customer_id,
rn_price.sum_pay,
rn,
o.order_id
from rn_price left join orders o on rn_price.customer_id = o.customer_id) last_t join sales s on last_t.order_id = s.order_id)
SELECT customer_id, sum_pay, rn
from view_sub join products p on view_sub.product_id = p.product_ID
ORDER BY 3;
# 2021-10-30 기준
# 지표 생성
SELECT customer_id
, MAX(order_date) as recent_date
, DATEDIFF('2021-10-30', MAX(order_date)) as recency
, count(customer_id) frequency
, sum(payment) monetary
from orders
GROUP BY customer_id
ORDER BY customer_id;
R, F, M 기준 정의
recency : 최근 구매
- 1달
- 2달
- 3달
- 3달 이상
frequency : 구매 빈도
- 4회 이상
- 2회 이상
- 1회(나머지)
monetary : 주문 총액
- 200,000$ 이상
- 150,000$ 이상
- 100,000$ 이상
- 50,000$ 이상
- 나머지
# RFM 점수 계산 with rfm AS ( SELECT customer_id , MAX(order_date) as recent_date , DATEDIFF('2021-10-30', MAX(order_date)) as recency , count(customer_id) frequency , sum(payment) monetary from orders GROUP BY customer_id ORDER BY customer_id ), rfm_score AS ( SELECT customer_id , recent_date , recency , frequency , monetary , CASE WHEN recency <= 30 THEN 4 WHEN recency <= 60 THEN 3 WHEN recency <= 90 THEN 2 ELSE 1 END AS R , CASE WHEN frequency >= 4 THEN 3 WHEN frequency >= 2 THEN 2 ELSE 1 END AS F , CASE WHEN monetary >= 200000 THEN 5 WHEN monetary >= 150000 THEN 4 WHEN monetary >= 100000 THEN 3 WHEN monetary >= 50000 THEN 2 ELSE 1 END AS M FROM rfm ) SELECT * , R+F+M RFM FROM rfm_score order by RFM desc;