About this vignette
This vignette is a sandbox for experimenting with plots. It currently uses the working example data (the same data used in the “Source Analysis Across Screening Phases” vignette), and it focuses on plots that we are still exploring.
If you have any questions, feedback, or ideas about this vignette or any of the others, be sure to check out our discussion board on GitHub: https://github.com/ESHackathon/CiteSource/discussions/100
1. Initial setup
#Load the necessary libraries
library(CiteSource)
library(dplyr)
library(ggplot2)
# Import citation files from a folder.
# The pattern is anchored with `$` so that only names ENDING in ".ris" are
# listed; the unanchored "\\.ris" would also match e.g. "AGRIS.ris.bak".
citation_files <- list.files(
  path = file.path("../vignettes/working_example_data"),
  pattern = "\\.ris$",
  full.names = TRUE
)
# Print citation_files to double check the order in which R imported our files.
citation_files
# Directory containing the citation files. The trailing slash is required
# because file names are appended to it with paste0() below.
file_path <- "../vignettes/working_example_data/"

# One row per citation file: the source it is attributed to and the label
# (search / screened / final) it is tagged with. Final.ris and TiAb.ris have
# no source (NA), only a label.
metadata_tbl <- tibble::tribble(
  ~files,          ~cite_sources, ~cite_labels,
  "AGRIS.ris",     "AGRIS",       "search",
  "CAB.ris",       "CAB",         "search",
  "EconLit.ris",   "EconLit",     "search",
  "Final.ris",     NA,            "final",
  "GreenFile.ris", "GreenFile",   "search",
  "McK.ris",       "Method1",     "search",
  "RM.ris",        "Method2",     "search",
  "TiAb.ris",      NA,            "screened",
  "WoS_early.ris", "WoS",         "search",
  "WoS_later.ris", "WoS",         "search"
)
# Turn the bare file names into paths relative to this vignette.
metadata_tbl$files <- paste0(file_path, metadata_tbl$files)

# Read every file listed in the metadata table.
citations <- read_citations(metadata = metadata_tbl)
# Deduplicate the imported citations and identify crossover records
unique_citations <- CiteSource::dedup_citations(citations)

# Count unique and non-unique citations across the sources and labels
n_unique <- CiteSource::count_unique(unique_citations)

# Build a data frame showing in which sources each record occurs
source_comparison <- CiteSource::compare_sources(
  unique_citations,
  comp_type = "sources"
)
2. Current plots
Heatmaps
# Plot overlap between sources as a heatmap matrix (default plot type)
my_heatmap <- CiteSource::plot_source_overlap_heatmap(source_comparison)
my_heatmap

# Plot the same overlap matrix as percentages
my_heatmap_percent <- CiteSource::plot_source_overlap_heatmap(
  source_comparison,
  plot_type = "percentages"
)
my_heatmap_percent
Upset plot
# Citation duplication across multiple resources
my_upset_plot <- CiteSource::plot_source_overlap_upset(
  source_comparison,
  decreasing = c(TRUE, TRUE)
)
#> Plotting a large number of groups. Consider reducing nset or sub-setting the data.
my_upset_plot
Unique/Crossover Bar Plot
# Unique vs. duplicate records per source, shown for each phase in the order
# search -> title/abstract screening -> final inclusion
my_contributions <- CiteSource::plot_contributions(
  n_unique,
  bar_order = c("search", "screened", "final"),
  center = TRUE
)
my_contributions
3. Prep for experimental plots
Unique records count from sources
# Get unique records from each source and add bibliographic data.
#
# The same filter + join was previously copy-pasted once per source; it now
# lives in a single helper so every source is processed identically.

#' Unique records of one source, joined to their bibliographic data
#'
#' @param source_name Value of `cite_source` to keep (e.g. "AGRIS").
#' @param counts Data frame with `cite_source`, `unique` and `duplicate_id`
#'   columns; defaults to `n_unique`.
#' @param records Data frame with a `duplicate_id` column holding the
#'   bibliographic data; defaults to `unique_citations`.
#' @return The rows of `counts` flagged as unique for `source_name`,
#'   inner-joined to `records` by `duplicate_id`. Columns present in both
#'   inputs get the usual `.x` / `.y` suffixes (e.g. `cite_source.x`).
get_unique_records <- function(source_name,
                               counts = n_unique,
                               records = unique_citations) {
  counts %>%
    dplyr::filter(cite_source == source_name, unique == TRUE) %>%
    dplyr::inner_join(records, by = "duplicate_id")
}

unique_AGRIS <- get_unique_records("AGRIS")
unique_CAB <- get_unique_records("CAB")
unique_EconLit <- get_unique_records("EconLit")
unique_WoS <- get_unique_records("WoS")
unique_Method1 <- get_unique_records("Method1")
unique_Method2 <- get_unique_records("Method2")

# NOTE(review): "GreenFile" is listed as a source in the metadata table but
# has no unique_* data frame here, so it is absent from all_unique. Confirm
# whether that omission is intentional.
all_unique <- dplyr::bind_rows(
  unique_AGRIS, unique_CAB, unique_EconLit,
  unique_WoS, unique_Method1, unique_Method2
)
4. Experimental plots
Publication year bar plots
# Build the per-source, per-year record counts used by the year plots:
#   1. keep only records whose year consists solely of digits,
#   2. turn the year into a number and keep years after 1965,
#   3. count the records for every source/year combination.
year_data <- all_unique %>%
  # Drop rows whose year is missing or not purely numeric text
  dplyr::filter(stringr::str_detect(as.character(year), "^[0-9]+$")) %>%
  # Safe to convert now: every remaining year is a digit string
  dplyr::mutate(year = as.numeric(as.character(year))) %>%
  # Remove rows where the year is 1965 or earlier
  dplyr::filter(year > 1965) %>%
  # One output row per source and year
  dplyr::group_by(cite_source.x, year) %>%
  dplyr::summarise(count = dplyr::n(), .groups = "drop")
# Stacked bar plot of unique records per publication year, coloured by source
unique_yearplot <- ggplot2::ggplot(
  data = year_data,
  mapping = ggplot2::aes(x = year, y = count, fill = cite_source.x)
) +
  # stat = "identity": bar heights are the pre-computed counts
  ggplot2::geom_bar(stat = "identity", position = "stack") +
  # Tick marks every five years starting at 1966
  ggplot2::scale_x_continuous(breaks = seq(1966, 2023, by = 5)) +
  # Fix the visible y-range to 0-600 without dropping any data
  ggplot2::coord_cartesian(ylim = c(0, 600)) +
  # Axis titles and legend title
  ggplot2::labs(
    x = "Publication year",
    y = "Unique records",
    fill = "Source"
  )
unique_yearplot
Interactive publication year bar plot
# Creating an interactive plot with plotly
library(plotly)
#>
#> Attaching package: 'plotly'
#> The following object is masked from 'package:ggplot2':
#>
#> last_plot
#> The following object is masked from 'package:stats':
#>
#> filter
#> The following object is masked from 'package:graphics':
#>
#> layout
# Convert the static year plot into an interactive plotly widget; the tooltip
# shows the source, the year and the record count of the hovered bar.
unique_year_funplot <- plotly::ggplotly(
  p = unique_yearplot,
  tooltip = c("cite_source.x", "year", "count"),
  dynamicTicks = TRUE
)
unique_year_funplot
Sankey plot idea
# Further work is needed to integrate unique_citations into this plot, as it is currently built by hand. It is still a good example of how the three custom fields can be visualized: in this case strings, sources, and timeline (inclusion/exclusion).
library(networkD3)
# Define nodes.
# Column 1: individual search strings, column 2: sources / methods,
# column 3+: screening outcomes. The last two rows are unnamed nodes.
# NOTE(review): id 17 occurs twice (the "NA" node and the first unnamed
# node). The id column is not passed to sankeyNetwork() in this vignette, but
# confirm the duplication is intended.
nodes <- data.frame(
  name = c(
    "AG 1", "AG 2", "AGRIS",
    "CAB 1", "CAB 2", "CAB",
    "EL 1", "EL 2", "EconLit",
    "WoS 1", "WoS 2", "WoS",
    "Method 1", "Method 2",
    "TI/AB Included", "Final Included", "Excluded", "NA",
    "", ""
  ),
  id = c(0:17, 17:18)
)

# Define links, one row per flow. source/target look like zero-based
# positions into `nodes` - confirm against the networkD3 documentation.
links <- data.frame(
  source = c(
    0, 1, 3, 4, 6, 7, 9, 10,                     # search strings -> sources
    2, 5, 8, 11, 2, 5, 8, 11, 12, 12, 13, 13,    # sources/methods -> TI/AB included or excluded
    14,                                          # TI/AB included -> final included
    14,                                          # TI/AB included -> excluded
    17, 17                                       # placeholder -> Method 1 / Method 2
  ),
  target = c(
    2, 2, 5, 5, 8, 8, 11, 11,
    14, 14, 14, 14, 16, 16, 16, 16, 14, 16, 14, 16,
    15,
    16,
    12, 13
  ),
  value = c(
    100, 150, 100, 150, 100, 150, 100, 100,
    50, 50, 50, 50, 200, 200, 200, 150, 50, 50, 50, 50,
    100,
    200,
    NA, NA                                       # the placeholder links carry no value
  )
)
# Create the Sankey diagram from the hand-built links and nodes tables
sankey <- networkD3::sankeyNetwork(
  # Flows: which columns of `links` hold the endpoints and the flow size
  Links = links,
  Source = "source",
  Target = "target",
  Value = "value",
  # Nodes: labelled by their `name` column
  Nodes = nodes,
  NodeID = "name",
  # Appearance
  units = "",
  fontSize = 12,
  nodeWidth = 20,
  # NOTE(review): presumably 0 keeps the nodes in the order given above
  # instead of letting the layout rearrange them - confirm.
  iterations = 0
)
sankey
5. Experimental source-specific tables
Unique Journal Count per Source
library(gt)
# Exploratory table - the following table and functions are not yet integrated into CiteSource
#' Count records per journal, most frequent journal first
#'
#' @param data A data frame with a `journal` column, one row per record.
#' @return A data frame with one row per journal title and the columns
#'   `journal` and `count` (number of rows of `data` with that title), sorted
#'   by descending `count`. Rows with a blank or missing journal title are
#'   not counted.
process_journals <- function(data) {
  # `journal != ""` is NA for missing titles, so those rows are dropped too
  titled <- dplyr::filter(data, journal != "")
  per_journal <- dplyr::summarise(
    dplyr::group_by(titled, journal),
    count = dplyr::n(),
    .groups = "drop"
  )
  dplyr::arrange(per_journal, dplyr::desc(count))
}
# Journal counts for the unique records of each source
AGRIS_journals   <- unique_AGRIS   %>% process_journals()
CAB_journals     <- unique_CAB     %>% process_journals()
EconLit_journals <- unique_EconLit %>% process_journals()
WoS_journals     <- unique_WoS     %>% process_journals()
Method1_journals <- unique_Method1 %>% process_journals()
Method2_journals <- unique_Method2 %>% process_journals()
#' Render a journal count table with gt
#'
#' @param data A data frame with the columns `journal` and `count`.
#' @param caption Text used as the table title.
#' @return A gt table titled `caption`, with the columns labelled "Journal"
#'   and "Unique Citations" and fixed width / font sizes.
display_gt_table <- function(data, caption) {
  tbl <- gt::gt(data)
  # Title and human-readable column headers
  tbl <- gt::tab_header(tbl, title = caption)
  tbl <- gt::cols_label(tbl, journal = "Journal", count = "Unique Citations")
  # Table styling options
  gt::tab_options(
    tbl,
    table.width = gt::px(600),
    table.font.size = gt::px(12),
    heading.title.font.size = gt::px(16),
    heading.subtitle.font.size = gt::px(14)
  )
}
Example of Top 10 journals
# head() returns at most 10 rows; `[1:10, ]` would pad the table with all-NA
# rows whenever a source has fewer than 10 journals.
display_gt_table(head(CAB_journals, 10), "Top 10 Journals by Unique Citations CAB")
Top 10 Journals by Unique Citations CAB | |
Journal | Unique Citations |
---|---|
Journal of Ethnobiology and Ethnomedicine | 14 |
Agroforestry Systems | 10 |
International Journal of Forest Usufructs Management | 10 |
Indian Forester | 9 |
Indian Journal of Traditional Knowledge | 7 |
RAP Publication | 7 |
Science of the Total Environment | 7 |
Water Practice & Technology | 7 |
Ethnobotany Research and Applications | 6 |
Forests, Trees and Livelihoods | 6 |
# head() returns at most 10 rows; `[1:10, ]` would pad the table with all-NA
# rows whenever a source has fewer than 10 journals.
display_gt_table(head(WoS_journals, 10), "Top 10 Journals by Unique Citations WoS")
Top 10 Journals by Unique Citations WoS | |
Journal | Unique Citations |
---|---|
Forest Policy and Economics | 160 |
International Forestry Review | 110 |
Ecological Economics | 69 |
Forest Ecology and Management | 69 |
Land Use Policy | 69 |
Small-Scale Forestry | 57 |
Society \\& Natural Resources | 50 |
World Development | 48 |
Agroforestry Systems | 45 |
Forests | 45 |
6. Current tables
Citation Record Table
# Citation record table, limited to records whose label contains "final"
record_level_table(
  dplyr::filter(unique_citations, stringr::str_detect(cite_label, "final")),
  return = "DT"
)
7. New tables
# Per-source counts that feed the three tables below
initial_counts <- CiteSource::record_counts(
  unique_citations, citations, "cite_source"
)
calculated_counts <- CiteSource::calculate_record_counts(
  unique_citations, citations, n_unique, "cite_source"
)
phase_counts <- CiteSource::calculate_phase_count(
  unique_citations, citations, "cite_source"
)

# Table 1: imported vs. distinct records per source
CiteSource::record_counts_table(initial_counts)
Record Counts | ||
Records Imported1 | Distinct Records2 | |
---|---|---|
AGRIS | 12 | 12 |
CAB | 687 | 686 |
EconLit | 50 | 50 |
GreenFile | 139 | 139 |
Method1 | 2656 | 2364 |
Method2 | 530 | 472 |
WoS | 3286 | 2989 |
Total | 7360 | 6712 |
1 Number of records imported from each source. | ||
2 Number of records after internal source deduplication |
# Table 2: unique / non-unique records and contribution percentages per source
CiteSource::record_summary_table(calculated_counts)
Record Counts | |||||||
Records Imported1 | Distinct Records2 | Unique records3 | Non-unique Records4 | Records Contributed %5 | Unique Records Contributed %6 | Unique Records %7 | |
---|---|---|---|---|---|---|---|
AGRIS | 12 | 12 | 12 | 0 | 0.2% | 0.3% | 100.0% |
CAB | 687 | 686 | 621 | 65 | 10.2% | 13.2% | 90.5% |
EconLit | 50 | 50 | 39 | 11 | 0.7% | 0.8% | 78.0% |
GreenFile | 139 | 139 | 65 | 74 | 2.1% | 1.4% | 46.8% |
Method1 | 2656 | 2364 | 1530 | 834 | 35.2% | 32.6% | 64.7% |
Method2 | 530 | 472 | 350 | 122 | 7.0% | 7.5% | 74.2% |
WoS | 3286 | 2989 | 2080 | 909 | 44.5% | 44.3% | 69.6% |
Total | 7360 | 6712 | 4697 | 2015 | NA | NA | NA |
1 Number of raw records imported from each database. | |||||||
2 Number of records after internal source deduplication | |||||||
3 Number of records not found in another source. | |||||||
4 Number of records found in at least one other source. | |||||||
5 Percent distinct records contributed to the total number of distinct records. | |||||||
6 Percent of unique records contributed to the total unique records. | |||||||
7 Percentage of records that were unique from each source. |
# Table 3: screening-phase counts with precision and sensitivity per source
CiteSource::precision_sensitivity_table(phase_counts)
Record Counts & Precision/Sensitivity | |||||
Distinct Records1 | Screened Included2 | Final Included3 | Precision4 | Sensitivity/Recall5 | |
---|---|---|---|---|---|
AGRIS | 12 | 0 | 0 | 0 | 0 |
CAB | 686 | 114 | 4 | 0.58 | 1.1 |
EconLit | 50 | 16 | 3 | 6 | 0.83 |
GreenFile | 139 | 41 | 4 | 2.88 | 1.1 |
Method1 | 2364 | 868 | 162 | 6.85 | 44.63 |
Method2 | 472 | 197 | 67 | 14.19 | 18.46 |
WoS | 2989 | 873 | 123 | 4.12 | 33.88 |
Total | 6712 | 6 1573 | 7 242 | - | - |
1 Number of source specific unique records | |||||
2 Records included after title/abstract screening | |||||
3 Records included after full text screening | |||||
4 Precision = Final Included / Distinct Records | |||||
5 Sensitivity/Recall = Final Included / Total Final Included | |||||
6 This is the total for Screened Included | |||||
7 This is the total for Final Included |