【R】tidyquery
2021年2月28日
1. はじめに
tidyquery
は、tidyにSQLクエリをあつかえるパッケージです。
2. インストール
CRANからインストールできます。
install.packages("tidyquery")
3. 使ってみる
query()
と show_dplyr()
を使えます。
library(tidyquery) library(nycflights13) library(tidyverse) data(flights) head(flights) query( " SELECT origin, dest, COUNT(flight) AS num_flts, round(SUM(seats)) AS num_seats, round(AVG(arr_delay)) AS avg_delay FROM flights f LEFT OUTER JOIN planes p ON f.tailnum = p.tailnum WHERE distance BETWEEN 200 AND 300 AND air_time IS NOT NULL GROUP BY origin, dest HAVING num_flts > 3000 ORDER BY num_seats DESC, avg_delay ASC LIMIT 2;" )
> head(flights)
# A tibble: 6 x 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 1 517 515 2 830
2 2013 1 1 533 529 4 850
3 2013 1 1 542 540 2 923
4 2013 1 1 544 545 -1 1004
5 2013 1 1 554 600 -6 812
6 2013 1 1 554 558 -4 740
# A tibble: 2 x 5
origin dest num_flts num_seats avg_delay
<chr> <chr> <int> <dbl> <dbl>
1 LGA DCA 4468 712643 6
2 EWR BOS 5247 611192 5
planes %>% filter(engine == "Turbo-jet") %>% query("SELECT manufacturer AS maker, COUNT(*) AS num_planes GROUP BY maker") %>% arrange(desc(num_planes)) %>% head(10)
R version 4.0.1 (2020-06-06) -- "See Things Now"
Copyright (C) 2020 The R Foundation for Statistical Computing
Platform: x86_64-w64-mingw32/x64 (64-bit)
R は、自由なソフトウェアであり、「完全に無保証」です。
一定の条件に従えば、自由にこれを再配布することができます。
配布条件の詳細に関しては、'license()' あるいは 'licence()' と入力してください。
R は多くの貢献者による共同プロジェクトです。
詳しくは 'contributors()' と入力してください。
また、R や R のパッケージを出版物で引用する際の形式については
'citation()' と入力してください。
'demo()' と入力すればデモをみることができます。
'help()' とすればオンラインヘルプが出ます。
'help.start()' で HTML ブラウザによるヘルプがみられます。
'q()' と入力すれば R を終了します。
> install.packages("tidyquery")
Installing package into ‘C:/Users/Dejima Shuichi/Documents/R/win-library/4.0’
(as ‘lib’ is unspecified)
also installing the dependency ‘queryparser’
URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/queryparser_0.3.1.zip' を試しています
Content type 'application/zip' length 214343 bytes (209 KB)
downloaded 209 KB
URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/tidyquery_0.2.2.zip' を試しています
Content type 'application/zip' length 70626 bytes (68 KB)
downloaded 68 KB
package ‘queryparser’ successfully unpacked and MD5 sums checked
package ‘tidyquery’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\Dejima Shuichi\AppData\Local\Temp\RtmpS8Fakk\downloaded_packages
> library(tidyquery)
警告メッセージ:
パッケージ ‘tidyquery’ はバージョン 4.0.3 の R の下で造られました
> library(nycflights13)
警告メッセージ:
パッケージ ‘nycflights13’ はバージョン 4.0.2 の R の下で造られました
>
>
> library(tidyquery)
> library(nycflights13)
>
>
> query(
+ " SELECT origin, dest,
+ COUNT(flight) AS num_flts,
+ round(SUM(seats)) AS num_seats,
+ round(AVG(arr_delay)) AS avg_delay
+ FROM flights f LEFT OUTER JOIN planes p
+ ON f.tailnum = p.tailnum
+ WHERE distance BETWEEN 200 AND 300
+ AND air_time IS NOT NULL
+ GROUP BY origin, dest
+ HAVING num_flts > 3000
+ ORDER BY num_seats DESC, avg_delay ASC
+ LIMIT 2;"
+ )
# A tibble: 2 x 5
origin dest num_flts num_seats avg_delay
<chr> <chr> <int> <dbl> <dbl>
1 LGA DCA 4468 712643 6
2 EWR BOS 5247 611192 5
> flight
エラー: オブジェクト 'flight' がありません
> data(flights)
> dat <- flights
> View(dat)
> head(flights)
# A tibble: 6 x 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 1 517 515 2 830
2 2013 1 1 533 529 4 850
3 2013 1 1 542 540 2 923
4 2013 1 1 544 545 -1 1004
5 2013 1 1 554 600 -6 812
6 2013 1 1 554 558 -4 740
# ... with 12 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>
> library(tidyverse)
-- Attaching packages --------------------------------------- tidyverse 1.3.0 --
√ ggplot2 3.3.2 √ purrr 0.3.4
√ tibble 3.0.3 √ dplyr 1.0.0
√ tidyr 1.1.0 √ stringr 1.4.0
√ readr 1.3.1 √ forcats 0.5.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
警告メッセージ:
1: パッケージ ‘tidyverse’ はバージョン 4.0.2 の R の下で造られました
2: パッケージ ‘ggplot2’ はバージョン 4.0.2 の R の下で造られました
3: パッケージ ‘tibble’ はバージョン 4.0.2 の R の下で造られました
> planes %>%
+ filter(engine == "Turbo-fan") %>%
+ query("SELECT manufacturer AS maker, COUNT(*) AS num_planes GROUP BY maker") %>%
+ arrange(desc(num_planes)) %>%
+ head(5)
# A tibble: 5 x 2
maker num_planes
<chr> <int>
1 BOEING 1276
2 BOMBARDIER INC 368
3 AIRBUS 331
4 EMBRAER 298
5 AIRBUS INDUSTRIE 270
> planes
# A tibble: 3,322 x 9
tailnum year type manufacturer model engines seats speed
<chr> <int> <chr> <chr> <chr> <int> <int> <int>
1 N10156 2004 Fixe~ EMBRAER EMB-~ 2 55 NA
2 N102UW 1998 Fixe~ AIRBUS INDU~ A320~ 2 182 NA
3 N103US 1999 Fixe~ AIRBUS INDU~ A320~ 2 182 NA
4 N104UW 1999 Fixe~ AIRBUS INDU~ A320~ 2 182 NA
5 N10575 2002 Fixe~ EMBRAER EMB-~ 2 55 NA
6 N105UW 1999 Fixe~ AIRBUS INDU~ A320~ 2 182 NA
7 N107US 1999 Fixe~ AIRBUS INDU~ A320~ 2 182 NA
8 N108UW 1999 Fixe~ AIRBUS INDU~ A320~ 2 182 NA
9 N109UW 1999 Fixe~ AIRBUS INDU~ A320~ 2 182 NA
10 N110UW 1999 Fixe~ AIRBUS INDU~ A320~ 2 182 NA
# ... with 3,312 more rows, and 1 more variable: engine <chr>
> table(planes$model)
150 172E 172M
1 1 1
172N 206B 210-5(205)
1 1 1
230 310Q 421C
1 1 1
550 60 65-A90
1 1 1
717-200 737-301 737-317
88 2 1
737-3A4 737-3G7 737-3H4
1 2 105
737-3K2 737-3L9 737-3Q8
2 2 5
737-3T5 737-3TO 737-3Y0
1 2 1
737-401 737-4B7 737-524
4 18 1
737-5H4 737-705 737-724
15 2 32
737-732 737-76N 737-76Q
10 23 2
737-7AD 737-7BD 737-7BX
1 24 3
737-7H4 737-7Q8 737-824
361 1 122
737-832 737-890 737-8FH
73 59 1
737-8H4 737-924 737-924ER
50 12 75
737-990 737-990ER 747-451
11 13 1
757-212 757-222 757-223
2 80 16
757-224 757-231 757-232
41 8 94
757-251 757-26D 757-2B7
29 1 2
757-2Q8 757-324 757-33N
10 9 12
757-351 767-201 767-223
4 1 13
767-224 767-322 767-323
2 32 27
767-324 767-332 767-3P6
1 55 6
767-424ER 767-432ER 777-200
16 17 1
777-222 777-224 777-232
4 4 3
787-8 A-1B A109E
4 1 1
A185F A319-112 A319-114
1 68 57
A319-115 A319-131 A319-132
3 55 25
A320-211 A320-212 A320-214
27 42 82
A320-231 A320-232 A321-211
8 256 43
A321-231 A330-223 A330-243
51 3 14
A330-323 A340-313 AT-5
1 1 1
CF-5D CL-600-2B19 CL-600-2C10
1 171 83
CL-600-2D24 DC-7BF DC-9-51
123 1 8
DC-9-82(MD-82) DC-9-83(MD-83) E-90
56 25 1
EMB-145 EMB-145LR EMB-145XR
1 114 104
ERJ 190-100 IGW FALCON-XP FALCON XP
80 3 6
G-IV G1159B KITFOX IV
1 1 1
MD-88 MD-90-30 MYSTERE FALCON 900
117 41 1
OTTER DHC-3 PA-28-180 PA-31-350
1 1 2
PA-32R-300 PA-32RT-300 R66
1 1 1
S-76A SR22 VANS AIRCRAFT RV6
1 1 1
ZODIAC 601HDS
1
> table(planes$engine)
4 Cycle Reciprocating Turbo-fan Turbo-jet
2 28 2750 535
Turbo-prop Turbo-shaft
2 5
> planes %>%
+ filter(engine == "Turbo-jet") %>%
+ query("SELECT manufacturer AS maker, COUNT(*) AS num_planes GROUP BY maker") %>%
+ arrange(desc(num_planes)) %>%
+ head(10)
# A tibble: 8 x 2
maker num_planes
<chr> <int>
1 BOEING 354
2 AIRBUS INDUSTRIE 130
3 MCDONNELL DOUGLAS 29
4 MCDONNELL DOUGLAS CORPORATION 14
5 AIRBUS 5
6 CANADAIR LTD 1
7 EMBRAER 1
8 GULFSTREAM AEROSPACE 1
> library(tidyquery)
> library(nycflights13)
> library(tidyverse)
> data(flights)
> head(flights)
# A tibble: 6 x 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 1 517 515 2 830
2 2013 1 1 533 529 4 850
3 2013 1 1 542 540 2 923
4 2013 1 1 544 545 -1 1004
5 2013 1 1 554 600 -6 812
6 2013 1 1 554 558 -4 740
# ... with 12 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>
> query(
+ " SELECT origin, dest,
+ COUNT(flight) AS num_flts,
+ round(SUM(seats)) AS num_seats,
+ round(AVG(arr_delay)) AS avg_delay
+ FROM flights f LEFT OUTER JOIN planes p
+ ON f.tailnum = p.tailnum
+ WHERE distance BETWEEN 200 AND 300
+ AND air_time IS NOT NULL
+ GROUP BY origin, dest
+ HAVING num_flts > 3000
+ ORDER BY num_seats DESC, avg_delay ASC
+ LIMIT 2;"
+ )
# A tibble: 2 x 5
origin dest num_flts num_seats avg_delay
<chr> <chr> <int> <dbl> <dbl>
1 LGA DCA 4468 712643 6
2 EWR BOS 5247 611192 5
> planes %>%
+ filter(engine == "Turbo-jet") %>%
+ query("SELECT manufacturer AS maker, COUNT(*) AS num_planes GROUP BY maker") %>%
+ arrange(desc(num_planes)) %>%
+ head(10)
# A tibble: 8 x 2
maker num_planes
<chr> <int>
1 BOEING 354
2 AIRBUS INDUSTRIE 130
3 MCDONNELL DOUGLAS 29
4 MCDONNELL DOUGLAS CORPORATION 14
5 AIRBUS 5
6 CANADAIR LTD 1
7 EMBRAER 1
8 GULFSTREAM AEROSPACE 1
show_dplyr( " SELECT manufacturer, COUNT(*) AS num_planes FROM planes WHERE engine = 'Turbo-fan' GROUP BY manufacturer ORDER BY num_planes DESC;" )
planes %>%
filter(engine == "Turbo-fan") %>%
group_by(manufacturer) %>%
summarise(num_planes = dplyr::n()) %>%
ungroup() %>%
arrange(dplyr::desc(num_planes))
4. さいごに
SQLはあまり使ってないので、これを機に再勉強しようか。