5 Analysing engagement with local news outlets on social media

In this chapter, I use my Academic Research access to the Twitter API, via the academictwitteR package, to extract and analyse engagement on Twitter between local news outlets and their communities.

# library(tidyverse)
# #devtools::install_github("cjbarrie/academictwitteR", build_vignettes = TRUE)
# library(academictwitteR)
# library(rvest)
# library(httr)
# library(purrr)
# set_bearer() # store the bearer token as an environment variable, then restart R and reload the packages
# get_bearer()
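
The comment above refers to storing the API token outside the script. A minimal sketch of that setup, assuming get_bearer() reads the token back from a TWITTER_BEARER entry in .Renviron (the token value below is a placeholder):

# usethis::edit_r_environ()              # opens .Renviron for editing
# # add the line below with your own token, save, then restart R:
# # TWITTER_BEARER=<your-academic-api-bearer-token>
# Sys.getenv("TWITTER_BEARER")           # after the restart, this should return the token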

5.1 Data Collection

5.1.1 Twitter handles

Through Media.info I was able to retrieve very few handles, so I resorted to extracting Twitter handles from the websites available in my directory. Not all URLs were still active, but some of them redirected to the outlets' new sites. One example is the Evening Express, which now sits under the Press and Journal in Aberdeen.

From 1,059 website URLs, I managed to extract 450 handles.

# twitter_profiles <-
#   readRDS("directory_with_hyperlocals.RDS") %>%
#   select(Publication, Twitter.x, Twitter.y, `Twitter url`, `Twitter handle`) %>%
#   mutate(Twitter = coalesce(Twitter.x, Twitter.y, `Twitter handle`)) %>%
#   select(-`Twitter handle`) %>%
#   filter(!is.na(`Twitter url`)) %>%
#   mutate(twitter_handle = str_remove(Twitter, "@")) %>%
#   select(Publication, twitter_handle)
# 
# # urls <-
# #   directory_final %>% select(Publication, Website) %>% na.omit() %>%
# #   mutate(Website = str_remove(Website, ".* "))
# # 
# # # many website URLs in my list have changed over time, so below I rewrite the list to include the redirected URLs, then extract Twitter handles from the pages they redirect to
# # urls$redirect_url <-
# #   do.call(rbind, lapply(urls$Website, function(x) {
# #     try(GET(x)$url)
# #   }))
# # urls <-
# #   urls %>% mutate(redirect_url = if_else(
# #     str_detect(redirect_url, "Error"),
# #     NA_character_,
# #     redirect_url
# #   ))
# # urls$twitter <- lapply(urls$redirect_url, function(url) {
# #   try(read_html(url) %>% html_elements(xpath = "//a[contains(@href, 'twitter.com/')]/@href") %>% html_text())
# # })
# # urls <- urls %>% rowwise %>% mutate(types = length(unique(twitter)),
# #                                     twitter = list(pluck(twitter, 1)))
# # saveRDS(urls, "urls.RDS")
# 
# urls <- readRDS("urls.RDS")
# 
# urls <- urls %>%
#   filter(!is.null(twitter)) %>%
#   mutate(
#     twitter = unlist(twitter),
#     twitter = if_else(str_detect(twitter, "Error"), NA_character_, twitter)
#   ) %>%
#   filter(!is.na(twitter)) %>%
#   mutate(
#     twitter_handle = str_remove(twitter, ".*twitter.com/"),
#     twitter_handle = str_remove(twitter_handle, "\\?.*"),
#     twitter_handle = str_remove(twitter_handle, "\\/.*"),
#     twitter_handle = str_remove(twitter_handle, "@")
#   ) %>%
#   full_join(twitter_profiles, by = c("Publication", "twitter_handle")) %>%
#   select(twitter_handle, Publication) %>%
#   filter(twitter_handle != "459701824769900545") %>% 
#   mutate(twitter_handle = if_else(Publication == "North Edinburgh Community News", "NorthEdinbNews", twitter_handle)) %>% 
#   pivot_wider(names_from = twitter_handle, values_from = Publication) %>%
#   pivot_longer(cols = everything()) %>%
#   rowwise %>%
#   mutate(titles_under_handle = length(unique(value))) 
# 
# urls$name[438] <- "myturriff"
# 
# plot <- ggplot(urls, aes(x = titles_under_handle)) +
#   geom_histogram(fill = "lightslateblue") +
#   theme_minimal() +
#   labs(x = "Number of titles per handle", y = "Number of handles")
# 
# jpgfile <-
#   fs::path(knitr::fig_path(),  "titles_by_twitterhandle.jpeg")
# ggsave(
#   jpgfile,
#   plot,
#   width = 40,
#   height = 10,
#   units = "cm",
#   scaling = 2
# )
# knitr::include_graphics(jpgfile)
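
As a quick sanity check on the handles extracted above, a minimal sketch (using the `urls` object built in the chunk above) to confirm how many unique, non-missing handles go forward to the API calls:

# urls %>%
#   filter(!is.na(name)) %>%   # drop rows where no handle could be extracted
#   distinct(name) %>%         # one row per unique handle
#   nrow()                     # should be close to the 450 handles reported above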

5.1.2 News outlet IDs and profile information

# twitter_handles <- urls$name
# 
# # Extract and store ids for each handle
# ids <- data.frame()
# for(handle in twitter_handles) {
#   result <- get_user_id(handle, get_bearer())
#   outlet <- names(result)
#   pair <- bind_cols(outlet, result) %>% set_names(c("outlet", "id"))
#   ids <- bind_rows(ids, pair)
# }
# 
# saveRDS(ids, "twitter_ids.RDS")
# 
# # Extract geo location from Twitter for outlets (contained in profile)
# id <- na.omit(ids$id)
# outlets_profiles <- data.frame()
# for (i in id) {                
#   iteration_y <- get_user_profile(
#     x = i,
#     bearer_token = get_bearer())
#   outlets_profiles <- bind_rows(outlets_profiles, iteration_y)}
# 
# saveRDS(outlets_profiles, "outlets_profiles.RDS")
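
The outlet-location analyses in section 5.2 depend on outlets reporting a location on their profile. A minimal check of that coverage, assuming the profile data returned above includes Twitter's free-text location field as a `location` column:

# outlets_profiles %>%
#   summarise(
#     n_outlets     = n(),
#     with_location = sum(!is.na(location) & location != "")   # profiles with a non-empty location
#   )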

5.1.3 Tweets

I collected all tweets posted in 2020, 2021, and 2022 by the 450 news outlets for which I had a Twitter handle.

# twitter_handles <- urls$name
# 
# tweets <- data.frame()
# for (handle in twitter_handles) {                
#   iteration_i <- get_all_tweets(
#     users = handle,
#     start_tweets = "2020-01-01T00:00:00Z",        
#     end_tweets = "2023-01-01T00:00:00Z",
#     n = Inf,
#     bearer_token = get_bearer(),
#     data_path = "tweets/")
#   tweets <- bind_rows(tweets, iteration_i)}
# 
# saveRDS(tweets, "tweets.RDS")
# tweets <- readRDS("tweets/tweets.RDS")
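
Because get_all_tweets() also writes the raw JSON responses to data_path, the collection can be rebuilt from disk if the loop is interrupted, without re-querying the API. A minimal sketch using academictwitteR's bind_tweets() (the "tidy" output format is available in recent versions of the package):

# tweets_from_disk <- bind_tweets(data_path = "tweets/", output_format = "tidy")
# nrow(tweets_from_disk)   # should match nrow(tweets) if the loop completed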

5.1.4 Twitter profile followers

This was by far the slowest step, but it is now complete; the notes below record how the collection was split across machines and sessions.

# ids <- drop_na(ids)
# id <- na.omit(ids$id)
# followers <- data.frame()
# for (i in id) {                
#   iteration_i <- get_user_followers(
#     x = i,
#     bearer_token = get_bearer())
#   followers <- bind_rows(followers, iteration_i)}
# 
# saveRDS(followers, "followers.RDS")

# currently gathered this data (out of 441 handles):
# - 1:65    done
# - 100:120 done on Julien's PC
# - 121:126 done in a background job from the script "followers_second_script.R"
# - 400:end Pablo is collecting

# left to do:
# - 66:99
# - 127:210
# - 211:300
# - 301:400
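
Since the follower collection was split across machines and sessions, the batches need to be combined before analysis. A minimal sketch, assuming each batch was saved as its own RDS file (the folder and file names below are hypothetical):

# follower_files <- list.files("followers_batches/", pattern = "\\.RDS$", full.names = TRUE)
# followers <- follower_files %>%
#   map(readRDS) %>%   # read each batch collected on a different machine or session
#   bind_rows() %>%
#   distinct()         # guard against overlapping index ranges
# saveRDS(followers, "followers.RDS")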

5.1.5 Tweet retweeters

I have paused this.

# tweets_retweets_likes <- tweets %>% 
#   select(author_id, conversation_id, text, public_metrics) %>% 
#   filter(!str_detect(text, "RT @"))
# 
# tweets_id <- tweets_retweets_likes$conversation_id
# 
# # hydrate_tweets() is not so useful here; the function below is better because it also returns the retweeters' geo location: started running at 13:48 8/2
# tweets_retweets <- data.frame()
# for (id in tweets_id) {
#   iteration_x <- get_retweeted_by(
#     id,
#     bearer_token = get_bearer(),
#     data_path = "retweets/",
#     verbose = TRUE
#   )
#   tweets_retweets <- bind_rows(tweets_retweets, iteration_x)
# }
# 
# saveRDS(tweets_retweets, "tweets_retweets.RDS")

5.1.6 Tweet likers

I have paused this.

# tweets_liking_users <- data.frame()
# for (id in tweets_id) {
#   iteration_x <- get_liking_users(
#     id,
#     bearer_token = get_bearer(),
#     verbose = TRUE
#   )
#   tweets_liking_users <- bind_rows(tweets_liking_users, iteration_x)
# }

5.2 Analysis

# # distribution of tweet dates
# tweets_2021 %>%
#   select(created_at) %>%
#   mutate(date = as.Date(created_at)) %>%
#   ggplot(aes(x = date)) + # this revealed that the "2021" tweets actually ran all the way to 2023
#   geom_bar()

5.2.1 Outlet location vs followers location

5.2.2 Outlet location vs engagers location

5.2.3 Post location vs engagers location