[R] 이기적 스터디 카페 3주차 예상문제

: ) YOUNG·2022년 6월 19일

R 빅데이터분석기사 빅분기

빅분기

목록 보기

19/20

1유형 1

data = https://raw.githubusercontent.com/Datamanim/datarepo/main/insurance/train.csv

Q1. Vehicle_Age 값이 2년 이상인 사람들만 필터링 하고 그중에서 Annual_Premium 값이 전체 데이터의 중간값 이상인 사람들을 찾고, 그들의 Vintage값의 평균을 구하여라

library(dplyr)

# Load the insurance training data directly from the course repository.
main <- read.csv(
    file = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/insurance/train.csv',
    encoding = 'UTF-8'
)

# Q1. Keep only the rows whose Vehicle_Age is '> 2 Years', then keep those whose
# Annual_Premium is at least the median of the full data set, and report the mean Vintage.

# The median is computed over every row of `main`, not over the filtered subset.
me <- median(main$Annual_Premium)

# Rows for vehicles older than two years.
ds1 <- filter(main, Vehicle_Age == '> 2 Years')

# Among those rows, average Vintage where the premium reaches the overall median.
result1 <- summarise(
    filter(ds1, Annual_Premium >= me),
    mean = mean(Vintage)
)
print(result1)

> print(result1)
      mean
1 154.4365

Q2. Vehicle_Age에 따른 각 성별(Gender) 그룹의 Annual_Premium 값의 평균을 구하여 아래 테이블과 동일하게 구현하라


library(reshape2)

# Q2. Mean Annual_Premium for every Vehicle_Age x Gender combination.
ds2 <- main

# Long format: one row per (Vehicle_Age, Gender) pair with its mean premium.
temp <- aggregate(
    Annual_Premium ~ Vehicle_Age + Gender,
    data = ds2,
    FUN = mean
)

# Wide format: one row per Vehicle_Age, one column per Gender.
result2 <- dcast(
    data = temp,
    formula = Vehicle_Age ~ Gender,
    value.var = 'Annual_Premium'
)
print(result2)


> result2
  Vehicle_Age   Female     Male
1    < 1 Year 29972.29 30310.98
2   > 2 Years 36108.37 35303.87
3    1-2 Year 30762.25 30413.09

2유형 1

심장질환예측

평가지표 : f1-score

trainData url : https://raw.githubusercontent.com/Datamanim/datarepo/main/heart/train.csv

testData url : https://raw.githubusercontent.com/Datamanim/datarepo/main/heart/test.csv

subData url : https://raw.githubusercontent.com/Datamanim/datarepo/main/heart/submission.csv


library(dplyr)
library(caret)
library(randomForest)
library(ModelMetrics)
library(tidyr)
library(readr)

# Heart-disease prediction. The label is the `target` column; the metric is F1.

train <- read.csv(
    file = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/heart/train.csv',
    encoding = 'UTF-8',
    na.strings = c('', ' ', 'NA', NA)
)

test <- read.csv(
    file = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/heart/test.csv',
    encoding = 'UTF-8',
    na.strings = c('', ' ', 'NA', NA)
)

sub <- read.csv(
    file = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/heart/submission.csv',
    encoding = 'UTF-8',
    na.strings = c('', ' ', 'NA', NA)
)

# Count missing values per column of the training data.
colSums(is.na(train))

# Recode the binary / coded columns into readable labels (still character here).
train$sex <- ifelse(train$sex == 1, 'male', 'female')
test$sex <- ifelse(test$sex == 1, 'male', 'female')

train$exang <- ifelse(train$exang == 1, 'yes', 'no')
test$exang <- ifelse(test$exang == 1, 'yes', 'no')

train$fbs <- ifelse(train$fbs == 1, 'true', 'false')
test$fbs <- ifelse(test$fbs == 1, 'true', 'false')

train$thal <- ifelse(train$thal == 1, 'normal', ifelse(train$thal == 2, 'fixed', 'reversable'))
test$thal <- ifelse(test$thal == 1, 'normal', ifelse(test$thal == 2, 'fixed', 'reversable'))

# Convert every categorical predictor to a factor. The test factors reuse the
# levels found in train, because predict() on a randomForest requires the factor
# levels of new data to match those seen during training.
factor_cols <- c('sex', 'exang', 'fbs', 'thal', 'restecg', 'cp', 'slope', 'ca')
for (col in factor_cols) {
    train[[col]] <- as.factor(train[[col]])
    test[[col]] <- factor(test[[col]], levels = levels(train[[col]]))
}

train$target <- as.factor(train$target)


str(train)

# Min-max scaling. The scaler is fitted on the training data ONLY and then
# applied to both sets, so train and test share the same min/max. Fitting a
# second scaler on test would put the two sets on different scales and make the
# forest's split thresholds meaningless on test data.
# Column 1 (age) is excluded from the fit, as in the original script, so it
# stays unscaled in both sets.
model <- preProcess(
    train[, -c(1)],
    method = c('range')
)

train <- predict(
    model,
    train
)

test <- predict(
    model,
    test
)


# Fix the RNG state so the forest (and therefore the submission) is reproducible.
set.seed(21)
rf <- randomForest(
    target ~ .,
    train,
    do.trace = TRUE,
    ntree = 400
)


pred <- predict(
    rf,
    newdata = test
)

# The submission file's single column is read as `X0`; use it as the reference
# labels and force the same level names as the predictions.
sub$X0 <- as.factor(sub$X0)
levels(sub$X0) <- c('0', '1')

# mode = 'everything' includes F1, the evaluation metric for this task.
caret::confusionMatrix(
    pred,
    sub$X0 ,
    positive = '1',
    mode = 'everything'
)


result <- data.frame(
    pred
)

names(result) <- 'target'

write.csv(result, "result.csv", row.names = FALSE)


> str(train)
'data.frame':   242 obs. of  14 variables:
 $ age     : int  60 51 51 59 60 46 54 45 54 44 ...
 $ sex     : Factor w/ 2 levels "female","male": 2 1 2 2 2 1 2 1 2 1 ...
 $ cp      : Factor w/ 4 levels "0","1","2","3": 1 3 4 3 1 3 1 2 2 3 ...
 $ trestbps: num  0.434 0.245 0.292 0.528 0.292 ...
 $ chol    : num  0.381 0.386 0.199 0.196 0.301 ...
 $ fbs     : Factor w/ 2 levels "false","true": 1 1 1 2 1 1 1 1 1 1 ...
 $ restecg : Factor w/ 3 levels "0","1","2": 1 1 1 2 1 1 2 1 2 2 ...
 $ thalach : num  0.762 0.638 0.333 0.638 0.486 ...
 $ exang   : Factor w/ 2 levels "no","yes": 1 1 2 1 2 2 1 1 1 1 ...
 $ oldpeak : num  0.214 0.107 0.25 0.286 0.5 ...
 $ slope   : Factor w/ 3 levels "0","1","2": 2 3 3 3 2 1 2 2 3 2 ...
 $ ca      : Factor w/ 5 levels "0","1","2","3",..: 3 1 2 1 2 1 2 1 1 1 ...
 $ thal    : Factor w/ 3 levels "fixed","normal",..: 3 1 1 1 3 1 3 1 3 1 ...
 $ target  : Factor w/ 2 levels "0","1": 1 2 2 2 1 2 1 2 2 2 ...



> str(test)
'data.frame':   61 obs. of  13 variables:
 $ age     : int  62 54 64 56 40 41 58 51 54 64 ...
 $ sex     : Factor w/ 2 levels "female","male": 1 2 1 2 2 1 2 1 2 2 ...
 $ cp      : Factor w/ 4 levels "0","1","2","3": 1 3 1 3 1 3 1 3 1 4 ...
 $ trestbps: num  0.3 0.25 1 0.375 0.125 0.15 0.575 0.25 0.5 0.125 ...
 $ chol    : num  0.295 0.47 0.708 0.463 0.146 ...
 $ fbs     : Factor w/ 2 levels "false","true": 1 1 1 2 1 1 1 1 1 1 ...
 $ restecg : Factor w/ 3 levels "0","1","2": 2 1 2 1 1 1 2 1 2 1 ...
 $ thalach : num  0.716 0.519 0.605 0.457 0.111 ...
 $ exang   : Factor w/ 2 levels "no","yes": 1 1 2 2 2 2 1 1 1 2 ...
 $ oldpeak : num  0 0.1 0 0.15 0.5 0 0.5 0.15 0.3 0.45 ...
 $ slope   : Factor w/ 3 levels "0","1","2": 3 2 3 2 2 3 2 3 3 2 ...
 $ ca      : Factor w/ 5 levels "0","1","2","3",..: 1 1 1 2 1 1 2 1 1 1 ...
 $ thal    : Factor w/ 3 levels "fixed","normal",..: 1 3 1 2 3 1 3 1 1 1 ...





> head(result, 10)
   target
1       1
2       1
3       1
4       0
5       0
6       1
7       0
8       1
9       1
10      1

1유형 2

Q1. price_range의 각 value를 그룹핑하여 각 그룹에서 n_cores의 빈도가 가장 높은 value와 그 빈도수를 구하여라


library(dplyr)
library(tidyr)

# Load the mobile-phone training data from the course repository.
main <- read.csv(
    file = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/mobile/train.csv',
    encoding = 'UTF-8'
)


# Q1. For each price_range value, find the n_cores value that occurs most often
# and how many times it occurs.

ds1 <- main

# Frequency of every (price_range, n_cores) pair; the result stays grouped by price_range.
core_counts <- summarise(group_by(ds1, price_range, n_cores), n = n())

# Order by descending frequency, then take the first row within each price_range group.
ranked_counts <- arrange(core_counts, desc(n))
result1 <- slice(ranked_counts, 1)
print(result1)

> print(result1)
# A tibble: 4 x 3
# Groups:   price_range [4]
  price_range n_cores     n
        <int>   <int> <int>
1           0       2    69
2           1       1    76
3           2       4    73
4           3       5    70

2유형 2

핸드폰 가격예측 (price_range컬럼 0(저렴) ~3(매우비쌈) 범위 )

문제타입 : 분류유형

평가지표 : accuracy

trainData url : https://raw.githubusercontent.com/Datamanim/datarepo/main/mobile/train.csv

testData url : https://raw.githubusercontent.com/Datamanim/datarepo/main/mobile/test.csv


library(dplyr)
library(caret)
library(ModelMetrics)
library(scales)
library(rpart)


# Mobile-phone price prediction: price_range runs from 0 (cheap) to 3 (very expensive).
# Classification task, evaluated by accuracy.

train <- read.csv(
    file = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/mobile/train.csv',
    encoding = 'UTF-8',
    na.strings = c('', ' ', NA)
)

test <- read.csv(
    file = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/mobile/test.csv',
    encoding = 'UTF-8',
    na.strings = c('', ' ', NA)
)

# Quick look at the raw data. No manual scaling is done here: every numeric
# column is min-max scaled further below by preProcess(method = 'range').
head(train$battery_power)


str(train)
unique(train$m_dep)


# Dependent variable
train$price_range <- as.factor(train$price_range)

# Categorical predictors. The test factors reuse the levels found in train so
# both sets are encoded identically when the tree predicts on test.
factor_cols <- c(
    'blue', 'three_g', 'touch_screen', 'wifi',
    'n_cores', 'four_g', 'dual_sim'
)
for (col in factor_cols) {
    train[[col]] <- as.factor(train[[col]])
    test[[col]] <- factor(test[[col]], levels = levels(train[[col]]))
}


# ---- Hold-out validation -------------------------------------------------
# The test set has no labels, so accuracy is estimated on a 30% hold-out taken
# from train. Comparing test predictions against labels of unrelated training
# rows would not measure anything.
set.seed(21)
parts <- sample(
    seq_len(nrow(train)),
    size = round(0.3 * nrow(train))
)

sam <- train[parts, ]        # hold-out rows used only for evaluation
fit_set <- train[-parts, ]   # rows used to fit the validation model

# Scaler fitted on the fitting rows only, then applied to the hold-out rows.
val_scaler <- preProcess(
    fit_set,
    method = c('range')
)

rp_val <- rpart(
    price_range ~ .,
    predict(val_scaler, fit_set)
)

val_pred <- predict(
    rp_val,
    newdata = predict(val_scaler, sam),
    type = 'class'
)

caret::confusionMatrix(
    val_pred,
    sam$price_range,
    mode = 'everything'
)


# ---- Final model and submission -------------------------------------------
# Data scaling: fit the scaler on the full training data and apply the SAME
# scaler to test, so both sets share one min/max per column.
model <- preProcess(
    train,
    method = c('range')
)

train2 <- predict(
    model,
    train
)

test2 <- predict(
    model,
    test
)


rp <- rpart(
    price_range ~ .,
    train2
)

rp.p <- predict(
    rp,
    newdata = test2,
    type = 'class'
)

result <- data.frame(
    test$id, 
    rp.p
)

names(result) <- c('id', 'price_range')
write.csv(result, 'result.csv', row.names = FALSE)


> head(result)
  id price_range
1  1           3
2  2           3
3  3           2
4  4           3
5  5           1
6  6           3

: ) YOUNG

이전 포스트

군집 분석 Clustering Analytic

다음 포스트

[R] 이기적 스터디 카페 3주차 예상문제

빅분기

1유형 1

Q1. Vehicle_Age 값이 2년 이상인 사람들만 필터링 하고 그중에서 Annual_Premium 값이 전체 데이터의 중간값 이상인 사람들을 찾고, 그들의 Vintage값의 평균을 구하여라

Q2. vehicle_age에 따른 각 성별(gender)그룹의 Annual_Premium값의 평균을 구하여 아래 테이블과 동일하게 구현하라

2유형 1

심장질환예측

1유형 2

Q1. price_range 의 각 value를 그룹핑하여 각 그룹의 n_cores 의 빈도가 가장높은 value와 그 빈도수를 구하여라

2유형 2

핸드폰 가격예측 (price_range컬럼 0(저렴) ~3(매우비쌈) 범위 )

군집 분석 Clustering Analytic

[R] 이기적 스터디 카페 5주차 예상문제

0개의 댓글