#Loading tidyverse and other libraries
Get tidytuesday data.
installed.packages("tidytuesdayR")
## Package LibPath Version Priority Depends Imports LinkingTo Suggests
## Enhances License License_is_FOSS License_restricts_use OS_type Archs
## MD5sum NeedsCompilation Built
library(tidytuesdayR)
# Get the Data
# Read in with tidytuesdayR package
# Install from CRAN via: install.packages("tidytuesdayR")
# This loads the readme and all the datasets for the week of interest
# Either ISO-8601 date or year/week works!
#tuesdata <- tidytuesdayR::tt_load('2021-09-28')
#tuesdata <- tidytuesdayR::tt_load(2021, week = 40)
#papers <- tuesdata$papers
# Or read in the data manually
<- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-09-28/papers.csv') papers
## Rows: 29434 Columns: 4
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): paper, title
## dbl (2): year, month
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-09-28/authors.csv') authors
## Rows: 15437 Columns: 4
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (4): author, name, user_nber, user_repec
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-09-28/programs.csv') programs
## Rows: 21 Columns: 3
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (3): program, program_desc, program_category
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-09-28/paper_authors.csv') paper_authors
## Rows: 67090 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): paper, author
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-09-28/paper_programs.csv') paper_programs
## Rows: 53996 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): paper, program
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
Installing didn’t work so I loaded the data in manually.
#viewing the data
summary(authors)
## author name user_nber user_repec
## Length:15437 Length:15437 Length:15437 Length:15437
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
summary(paper_authors)
## paper author
## Length:67090 Length:67090
## Class :character Class :character
## Mode :character Mode :character
summary(paper_programs)
## paper program
## Length:53996 Length:53996
## Class :character Class :character
## Mode :character Mode :character
summary(papers)
## paper year month title
## Length:29434 Min. :1973 Min. : 1.000 Length:29434
## Class :character 1st Qu.:1999 1st Qu.: 4.000 Class :character
## Mode :character Median :2008 Median : 7.000 Mode :character
## Mean :2006 Mean : 6.534
## 3rd Qu.:2015 3rd Qu.: 9.000
## Max. :2021 Max. :12.000
summary(programs)
## program program_desc program_category
## Length:21 Length:21 Length:21
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
Summaries show that there are
#Data wrangling
##Merge all of the data together
<-full_join(papers, paper_programs, by = "paper") %>%
allthethingsfull_join(paper_authors, by="paper") %>%
full_join(authors, by= "author")%>%
full_join(programs, by="program")%>%
drop_na() #remove NA data
summary(allthethings)
## paper year month title
## Length:82091 Min. :1975 Min. : 1.000 Length:82091
## Class :character 1st Qu.:2005 1st Qu.: 4.000 Class :character
## Mode :character Median :2012 Median : 6.000 Mode :character
## Mean :2010 Mean : 6.492
## 3rd Qu.:2017 3rd Qu.: 9.000
## Max. :2021 Max. :12.000
## program author name user_nber
## Length:82091 Length:82091 Length:82091 Length:82091
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## user_repec program_desc program_category
## Length:82091 Length:82091 Length:82091
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##Shape up the data using dyplr.
<-allthethings%>%
dataselect(year, month,program_category, program, program_desc)%>% #select for points of interests
drop_na()%>% #remove rows with NA
mutate(program_category=as.factor(program_category), program=as.factor(program), program_desc=as.factor(program_desc)) #turn characters into factors
summary(data)
## year month program_category program
## Min. :1975 Min. : 1.000 Finance : 7398 EFG : 9985
## 1st Qu.:2005 1st Qu.: 4.000 Macro/International:26421 LS : 8994
## Median :2012 Median : 6.000 Micro :48272 PE : 8763
## Mean :2010 Mean : 6.492 IFM : 6266
## 3rd Qu.:2017 3rd Qu.: 9.000 ME : 5198
## Max. :2021 Max. :12.000 ITI : 4972
## (Other):37913
## program_desc
## Economic Fluctuations and Growth : 9985
## Labor Studies : 8994
## Public Economics : 8763
## International Finance and Macroeconomics: 6266
## Monetary Economics : 5198
## International Trade and Investment : 4972
## (Other) :37913
##Play with some graphics
Papers published throughout the years
<-data%>%
CountedCatsgroup_by(year,program_category)%>%
count(program_category)
%>%
CountedCatsfilter(year!="2021")%>%
ggplot(aes(year,n))+
geom_path(aes(group="program_category"))+
labs(x="Year", y="Papers Published")
str(data)
## tibble [82,091 x 5] (S3: tbl_df/tbl/data.frame)
## $ year : num [1:82091] 1975 1975 1976 1976 1976 ...
## $ month : num [1:82091] 5 5 3 3 3 4 10 8 7 1 ...
## $ program_category: Factor w/ 3 levels "Finance","Macro/International",..: 2 2 3 2 2 3 3 3 2 3 ...
## $ program : Factor w/ 20 levels "AG","AP","CF",..: 12 14 11 12 14 11 16 16 17 18 ...
## $ program_desc : Factor w/ 20 levels "Asset Pricing",..: 13 14 11 13 14 11 15 15 17 20 ...
%>%
datafilter(year!="2021")%>%
group_by(year, program_desc)%>%
count(program_category)%>%
ggplot(aes(year, n, fill=program_category))+
geom_bar(position = "stack", stat = "identity")+
scale_x_continuous(breaks = seq(1975, 2020, 25 ))+
facet_wrap(~program_desc)+
labs(x="Year", y="Papers Published", title="Published Papers Over the Years")
%>%
datafilter(year!="2021")%>%
group_by(year, program_category)%>%
count(program_desc)%>%
ggplot(aes(year, n, fill=program_desc))+
geom_bar(position = "stack", stat = "identity")+
labs(x="Year", y="Papers Published", title="Published Papers Over the Years")+
scale_x_continuous(breaks = seq(1975, 2020, 10 ))+
facet_wrap(~program_category)