Skip to contents

About this vignette

This vignette is a sandbox for experimenting with plots. It currently uses the working example data (the same data used in the “Source Analysis Across Screening Phases” vignette). The focus is on plots that we are still exploring.

If you have any questions, feedback, ideas, etc. about this vignette or others, be sure to check out our discussion board on GitHub! https://github.com/ESHackathon/CiteSource/discussions/100

1. Initial setup

#Load the necessary libraries
library(CiteSource)
library(dplyr)
library(ggplot2)

# Folder that holds the example citation files
data_dir <- "../vignettes/working_example_data"
# Same folder with a trailing slash; used below as a prefix for the file names
file_path <- paste0(data_dir, "/")

# List the citation files in that folder. The pattern is anchored with `$` so
# that only names ending in ".ris" are returned.
citation_files <- list.files(
  path = data_dir,
  pattern = "\\.ris$",
  full.names = TRUE
)
# Print citation_files to double check the order in which R lists the files
citation_files

# One row per file: the file name, the source it came from and its label.
# Final.ris and TiAb.ris carry no source (NA), only a label.
metadata_tbl <- tibble::tribble(
  ~files,           ~cite_sources, ~cite_labels,
  "AGRIS.ris",      "AGRIS",       "search",
  "CAB.ris",        "CAB",         "search",
  "EconLit.ris",    "EconLit",     "search",
  "Final.ris",      NA,            "final",
  "GreenFile.ris",  "GreenFile",   "search",
  "McK.ris",        "Method1",     "search",
  "RM.ris",         "Method2",     "search",
  "TiAb.ris",       NA,            "screened",
  "WoS_early.ris",  "WoS",         "search",
  "WoS_later.ris",  "WoS",         "search"
) %>%
  # Turn the bare file names into paths inside the data folder
  dplyr::mutate(files = paste0(file_path, files))

# Read every file listed in the metadata table
citations <- read_citations(metadata = metadata_tbl)
# Deduplication & identifying crossover records
unique_citations <- dedup_citations(citations)
# Count number of unique and non-unique citations from different sources and labels
n_unique <- count_unique(unique_citations)
# Create dataframe indicating occurrence of records across sources
source_comparison <- compare_sources(unique_citations, comp_type = "sources")

2. Current plots

Heatmaps

# Source overlap as a heatmap matrix, using the function's default plot type
my_heatmap <- source_comparison %>%
  plot_source_overlap_heatmap()

my_heatmap


# Source overlap as a heatmap matrix, shown as percentages
my_heatmap_percent <- source_comparison %>%
  plot_source_overlap_heatmap(plot_type = "percentages")

my_heatmap_percent

Upset plot

# Citation duplication across multiple resources

my_upset_plot <- source_comparison %>%
  plot_source_overlap_upset(decreasing = c(TRUE, TRUE))
#> Plotting a large number of groups. Consider reducing nset or sub-setting the data.

my_upset_plot

Unique/Crossover Bar Plot

# Unique vs. duplicate records per source across the three phases:
# search, title/abstract screening and final inclusion

my_contributions <- n_unique %>%
  plot_contributions(
    center = TRUE,
    bar_order = c("search", "screened", "final")
  )

my_contributions

3. Prep for experimental plots

Unique records count from sources

# Get the unique records of one source and add their bibliographic data.
#
# source_name: value of `cite_source` to keep, e.g. "AGRIS".
# counts:      table with `cite_source`, `unique` and `duplicate_id` columns;
#              defaults to `n_unique`.
# records:     table with the bibliographic data, keyed by `duplicate_id`;
#              defaults to `unique_citations`.
# Returns the rows of `counts` for that source where `unique` is TRUE,
# inner-joined to `records` on "duplicate_id".
get_unique_records <- function(source_name,
                               counts = n_unique,
                               records = unique_citations) {
  counts %>%
    # `.env$` makes sure the function argument is used, never a column that
    # happens to be called `source_name`
    dplyr::filter(cite_source == .env$source_name, unique == TRUE) %>%
    inner_join(records, by = "duplicate_id")
}

unique_AGRIS <- get_unique_records("AGRIS")
unique_CAB <- get_unique_records("CAB")
unique_EconLit <- get_unique_records("EconLit")
unique_WoS <- get_unique_records("WoS")
unique_Method1 <- get_unique_records("Method1")
unique_Method2 <- get_unique_records("Method2")

# Stack the per-source tables into one.
# NOTE(review): GreenFile is an imported search source but is not part of this
# list - confirm whether leaving it out is intentional.
all_unique <- bind_rows(unique_AGRIS, unique_CAB, unique_EconLit,
                        unique_WoS, unique_Method1, unique_Method2)

4. Experimental plots

Publication year bar plots


# Count the unique records per source and publication year.
# `year` is turned into text first so that entries that are not made up of
# digits only can be dropped before it is converted back to a number.
year_data <- all_unique %>%
  dplyr::mutate(year = as.character(year)) %>%
  dplyr::filter(stringr::str_detect(year, "^[0-9]+$")) %>%
  dplyr::mutate(year = as.numeric(year)) %>%
  # Keep publication years after 1965 only
  dplyr::filter(year > 1965) %>%
  group_by(cite_source.x, year) %>%
  # One row per source/year pair; the result is returned ungrouped
  summarise(count = n(), .groups = "drop")


# Stacked bar plot of those counts: one bar per year, one colour per source
unique_yearplot <- ggplot(
  year_data,
  aes(x = year, y = count, fill = cite_source.x)
) +
  # stat = "identity": bar heights are the precomputed counts
  geom_bar(position = "stack", stat = "identity") +
  # Zoom the y-axis to 0-600
  coord_cartesian(ylim = c(0, 600)) +
  # Axis ticks every five years, starting at 1966
  scale_x_continuous(breaks = seq(1966, 2023, by = 5)) +
  labs(
    x = "Publication year",
    y = "Unique records",
    fill = "Source"
  )

unique_yearplot

Interactive publication year bar plot

# Creating an interactive plot with plotly
library(plotly)
#> 
#> Attaching package: 'plotly'
#> The following object is masked from 'package:ggplot2':
#> 
#>     last_plot
#> The following object is masked from 'package:stats':
#> 
#>     filter
#> The following object is masked from 'package:graphics':
#> 
#>     layout

# Interactive version of the year plot; the tooltip is limited to the source,
# the year and the record count
unique_year_funplot <- ggplotly(
  unique_yearplot,
  tooltip = c("cite_source.x", "year", "count"),
  dynamicTicks = TRUE
)

unique_year_funplot

Sankey plot idea

# Further work is needed to build this plot from unique_citations; for now the
# nodes and links are entered by hand. It is still a useful example of how the
# three custom fields can be visualized: Strings / Sources / Timeline
# (Inclusion/Exclusion).

library(networkD3)

# Define nodes. sankeyNetwork() below is given only the `name` column
# (NodeID = "name") and the links refer to nodes by zero-based row position,
# so `id` is a reading aid that mirrors each row's position.
nodes <- data.frame(
  name = c("AG 1", "AG 2", "AGRIS",
           "CAB 1", "CAB 2", "CAB",
           "EL 1", "EL 2", "EconLit",
           "WoS 1", "WoS 2", "WoS",
           "Method 1", "Method 2",
           "TI/AB Included", "Final Included", "Excluded", "NA"),
  id = 0:17
)

# Define links: each pair of search strings flows into its database
links <- data.frame(
  source = c(0, 1, 3, 4, 6, 7, 9, 10),
  target = c(2, 2, 5, 5, 8, 8, 11, 11),
  value = c(100, 150, 100, 150, 100, 150, 100, 100)
)

# Add links for the second column nodes (including Method 1 and Method 2)
# flowing into TI/AB Included (14) and Excluded (16)
links <- rbind(links, data.frame(
  source = c(2, 5, 8, 11, 2, 5, 8, 11, 12, 12, 13, 13),
  target = c(14, 14, 14, 14, 16, 16, 16, 16, 14, 16, 14, 16),
  value = c(50, 50, 50, 50, 200, 200, 200, 150, 50, 50, 50, 50)
))

# Add links for TI/AB Included flowing into Final Included (15) and Excluded (16)
links <- rbind(links, data.frame(
  source = c(14, 14),
  target = c(15, 16),
  value = c(100, 200)
))

# Append two blank-labelled nodes. Their ids continue from the existing rows
# (18 and 19) rather than reusing 17, which already belongs to the "NA" node.
nodes <- rbind(nodes, data.frame(
  name = c("", ""),
  id = 18:19
))

# Links from position 17 (the node labelled "NA") into Method 1 and Method 2,
# with missing values.
# NOTE(review): presumably placeholder links so that Method 1 and Method 2 are
# laid out in the second column - confirm against the rendered plot.
links <- rbind(links, data.frame(
  source = c(17, 17),
  target = c(12, 13),
  value = c(NA, NA)
))

# Create Sankey diagram
sankey <- sankeyNetwork(
  Links = links,
  Nodes = nodes,
  Source = "source",
  Target = "target",
  Value = "value",
  NodeID = "name",
  units = "",
  fontSize = 12,
  nodeWidth = 20,
  iterations = 0
)
sankey

5. Experimental source-specific tables

Unique Journal Count per Source

library(gt)
#Exploratory table - The following table and functions are not yet integrated into CiteSource

# Tally the records of one source by journal.
# `data` needs a `journal` column. The result has one row per journal with the
# number of records in `count`, sorted from most to fewest records.
process_journals <- function(data) {
  # Number of records per journal
  per_journal <- summarise(group_by(data, journal), count = n())
  # Drop blank journal titles (filter() also drops rows where the test is NA)
  titled_only <- dplyr::filter(per_journal, journal != "")
  arrange(titled_only, desc(count))
}

# Journal tallies for the unique records of each source
AGRIS_journals <- unique_AGRIS %>% process_journals()
CAB_journals <- unique_CAB %>% process_journals()
EconLit_journals <- unique_EconLit %>% process_journals()
WoS_journals <- unique_WoS %>% process_journals()
Method1_journals <- unique_Method1 %>% process_journals()
Method2_journals <- unique_Method2 %>% process_journals()

# Render a journal tally as a gt table.
# `data` needs the columns `journal` and `count`; `caption` becomes the table
# title. Returns the styled gt table.
display_gt_table <- function(data, caption) {
  titled <- tab_header(gt(data), title = caption)
  # Human-readable column headers
  labelled <- cols_label(titled, journal = "Journal", count = "Unique Citations")
  # Fixed width and font sizes, all in pixels
  tab_options(
    labelled,
    table.width = px(600),
    table.font.size = px(12),
    heading.title.font.size = px(16),
    heading.subtitle.font.size = px(14)
  )
}

Example of Top 10 journals

# head() returns at most 10 rows and never pads a shorter table with NA rows
display_gt_table(head(CAB_journals, 10), "Top 10 Journals by Unique Citations CAB")
Top 10 Journals by Unique Citations CAB
Journal Unique Citations
Journal of Ethnobiology and Ethnomedicine 14
Agroforestry Systems 10
International Journal of Forest Usufructs Management 10
Indian Forester 9
Indian Journal of Traditional Knowledge 7
RAP Publication 7
Science of the Total Environment 7
Water Practice & Technology 7
Ethnobotany Research and Applications 6
Forests, Trees and Livelihoods 6

# head() returns at most 10 rows and never pads a shorter table with NA rows
display_gt_table(head(WoS_journals, 10), "Top 10 Journals by Unique Citations WoS")
Top 10 Journals by Unique Citations WoS
Journal Unique Citations
Forest Policy and Economics 160
International Forestry Review 110
Ecological Economics 69
Forest Ecology and Management 69
Land Use Policy 69
Small-Scale Forestry 57
Society \\& Natural Resources 50
World Development 48
Agroforestry Systems 45
Forests 45

6. Current tables

Citation Record Table

# Citation record table for the records whose label contains "final"

record_level_table(
  dplyr::filter(unique_citations, stringr::str_detect(cite_label, "final")),
  return = "DT"
)

7. New tables

# Inputs for the count tables in this section, all computed by "cite_source"
initial_counts <- record_counts(unique_citations, citations, "cite_source")

calculated_counts <- calculate_record_counts(
  unique_citations, citations, n_unique, "cite_source"
)

phase_counts <- calculate_phase_count(unique_citations, citations, "cite_source")

# Table built from the initial counts
initial_counts %>% record_counts_table()
Record Counts
Records Imported1 Distinct Records2
AGRIS 12 12
CAB 687 686
EconLit 50 50
GreenFile 139 139
Method1 2656 2364
Method2 530 472
WoS 3286 2989
Total 7360 6712
1 Number of records imported from each source.
2 Number of records after internal source deduplication
calculated_counts %>% record_summary_table()
Record Counts
Records Imported1 Distinct Records2 Unique records3 Non-unique Records4 Records Contributed %5 Unique Records Contributed %6 Unique Records %7
AGRIS 12 12 12 0 0.2% 0.3% 100.0%
CAB 687 686 621 65 10.2% 13.2% 90.5%
EconLit 50 50 39 11 0.7% 0.8% 78.0%
GreenFile 139 139 65 74 2.1% 1.4% 46.8%
Method1 2656 2364 1530 834 35.2% 32.6% 64.7%
Method2 530 472 350 122 7.0% 7.5% 74.2%
WoS 3286 2989 2080 909 44.5% 44.3% 69.6%
Total 7360 6712 4697 2015 NA NA NA
1 Number of raw records imported from each database.
2 Number of records after internal source deduplication
3 Number of records not found in another source.
4 Number of records found in at least one other source.
5 Percent distinct records contributed to the total number of distinct records.
6 Percent of unique records contributed to the total unique records.
7 Percentage of records that were unique from each source.
Record Counts & Precision/Sensitivity
Distinct Records1 Screened Included2 Final Included3 Precision4 Sensitivity/Recall5
AGRIS 12 0 0 0 0
CAB 686 114 4 0.58 1.1
EconLit 50 16 3 6 0.83
GreenFile 139 41 4 2.88 1.1
Method1 2364 868 162 6.85 44.63
Method2 472 197 67 14.19 18.46
WoS 2989 873 123 4.12 33.88
Total 6712 6 1573 7 242 - -
1 Number of source specific unique records
2 Records included after title/abstract screening
3 Records included after full text screening
4 Precision = Final Included / Distinct Records
5 Sensitivity/Recall = Final Included / Total Final Included
6 This is the total for Screened Included
7 This is the total for Final Included