Preprocessing
Autonomy
Digital Media
Self-Determination
Load Data
Relevant Papers
# Search pattern for computer-mediated communication (CMC) terms.
# Matching ignores case. Several entries are deliberately truncated stems
# (e.g. "online communit", "instant mess", "technolog") so that one entry
# covers several word endings.
regex_cmc <- rx_either_of(
  rx_with_any_case(),
  "internet",
  "cyber",
  "online media",
  "online communication",
  "online social network",
  "online communit",
  "chat",
  "email",
  "computer-mediated",
  "mobile phone",
  "smartphone",
  "instant mess",
  "mobile mess",
  "social media",
  # "social network... site...": lazy wildcards bridge the word endings
  rx() %>%
    rx_find("social ") %>%
    rx_find("network") %>%
    rx_anything(mode = "lazy") %>%
    rx_find("site") %>%
    rx_anything(mode = "lazy"),
  "information and communication technolog",
  "facebook",
  "instagram",
  "snapchat",
  "twitter",
  "wechat",
  "weibo",
  "texting"
)
# Relevant papers: journal articles whose abstract or title contains
# at least one of the CMC search terms
selection <- data %>%
  filter(
    str_detect(abstract, regex_cmc) | str_detect(title, regex_cmc),
    publication_type == "journal-article"
  )
# Load the list of all concepts (as provided by OpenAlex) from disk
concepts <- read_csv(file = "data/concepts.csv")
# Keep a per-paper copy of the complete `concept` column under the name
# `all_concepts`; the original column is consumed by the unnest() calls
# further down and would otherwise be lost.
all_concepts <- selection %>%
  select(id, all_concepts = concept)
# Save papers whose one and only concept is "Autonomy".
# The main-concept step drops every "Autonomy" row, so these papers would
# otherwise disappear; they are re-added afterwards with bind_rows().
# The explicit name check keeps single-concept papers with a different
# concept out of this set: those already survive the main-concept step and
# would end up duplicated after the bind.
only_autonomy_concept <- selection %>%
  unnest(concept) %>%
  group_by(id) %>%
  filter(n() == 1, concept_name == "Autonomy") %>%
  ungroup()
# Pick the most likely (main) concept for each paper:
# lowest concept_lecel first, ties broken by the highest concept_score.
# "Autonomy" itself is excluded as a candidate here.
# NOTE(review): the column really is spelled `concept_lecel` in the data;
# `concept_score` is assumed to be one of the unnested concept columns, as
# stated in the original comment -- confirm before running.
selected_papers <- selection %>%
  unnest(concept) %>%
  filter(concept_name != "Autonomy") %>%
  group_by(id) %>%
  # without the score tie-breaker the winner among equal-level concepts
  # would depend on the incidental row order
  arrange(concept_lecel, desc(concept_score), .by_group = TRUE) %>%
  slice(1) %>%
  ungroup()
# Re-add the Autonomy-only papers, attach each paper's full concept list,
# and expose the chosen concept as `main_concept`.
selected_papers <- selected_papers %>%
  bind_rows(only_autonomy_concept) %>%
  # join on the paper id explicitly, so the key does not silently change
  # if the two tables ever share another column name
  left_join(all_concepts, by = "id") %>%
  rename(main_concept = concept_name)
# Frequency of each main concept, most common first
count(selected_papers, main_concept, sort = TRUE)
# A tibble: 19 × 2
main_concept n
<chr> <int>
1 Psychology 670
2 Computer science 408
3 Sociology 339
4 Business 204
5 Medicine 180
6 Political science 178
7 Engineering 10
8 Philosophy 9
9 Economics 8
10 History 3
11 Art 2
12 Biology 2
13 Geography 2
14 Knowledge management 2
15 Advertising 1
16 Autonomy 1
17 Econometrics 1
18 Public relations 1
19 Squatting position 1
# Concepts of interest: papers with any other main concept are dropped
coi <- c(
  "Advertising", "Autonomy", "Business",
  "Economics", "Computer science", "Political science",
  "Psychology", "Public relations", "Sociology"
)
selected_papers <- filter(selected_papers, main_concept %in% coi)
# check if abstract or covariates of interest have missing values
# (tabulates is.na(): TRUE = missing, FALSE = present)
# NOTE(review): only `abstract` is checked in the visible code; confirm
# whether the covariate checks live elsewhere.
table(is.na(selected_papers$abstract))
FALSE
1810
FALSE
1810
FALSE
1810
Clean
# Clean the abstracts and keep only English-language papers.
# The cleaned text is stored in `clean_abstract`; `abstract` is untouched.
clean_papers <- selected_papers %>%
  mutate(
    clean_abstract = abstract %>%
      # remove hashtags first: "#" counts as punctuation, so once
      # punctuation is stripped the "#\\w+" pattern can no longer match
      str_replace_all("#\\w+", "") %>%
      # remove punctuation
      str_replace_all("[:punct:]", "") %>%
      # remove symbols
      str_replace_all("[:symbol:]", "") %>%
      # remove numbers
      str_replace_all("[:digit:]", "") %>%
      # collapse repeated whitespace and trim both ends
      str_squish()
  ) %>%
  # detect the language of each cleaned abstract
  mutate(cld2_lang = cld2::detect_language(clean_abstract)) %>%
  # keep English abstracts only; rows where the comparison is NA
  # are dropped by filter() as well
  filter(cld2_lang == "en")
# check again if abstract or covariates of interest have missing values
# (tabulates is.na(): TRUE = missing, FALSE = present)
# NOTE(review): only `clean_abstract` is checked in the visible code;
# confirm whether the covariate checks live elsewhere.
table(is.na(clean_papers$clean_abstract))
FALSE
1744
FALSE
1744
FALSE
1744