Mfte

Latest version: v1.0.0.7

Safety actively analyzes 723217 Python packages for vulnerabilities to keep your Python projects secure.

1.0.0

1. Freeze experimental features

# Import all COCA files from the evaluation directory, tagging each with its
# fileID, register and corpus.
# Get file names from directory
files <- list.files(here("evaluation"))
# Split to save names; the name for each data frame will be the first element
names <- strsplit(files, "\\.")
# Now get the files
# NOTE(review): every file is imported with register = "spoken" and
# corpus = "COCA", and list.files() is not restricted to .xlsx workbooks --
# confirm this is intended before relying on this loop.
for (i in seq_along(files)) { # for each file in the list
  fileName <- files[[i]] # save filename of element i
  dataName <- names[[i]][[1]] # save data name of element i
  # Pass the variable fileName (not the literal string "fileName") as the
  # path component, otherwise every iteration tries to open the same
  # non-existent file.
  tempData <- importEval(
    file = read_excel(here("evaluation", fileName), col_types = "text"),
    fileID = dataName,
    register = "spoken",
    corpus = "COCA"
  )
  # Assign the results of the file to an object named after the file
  assign(dataName, tempData, envir = .GlobalEnv)
}
# Import each COCA evaluation workbook into a global object named after its
# file ID. The vector maps file ID (workbook name without ".xlsx") to the
# register label passed to importEval().
register_by_file_id <- c(
  COCA_acad_4000541 = "academic",
  COCA_acad_4017541 = "academic",
  COCA_acad_4170341 = "academic",
  COCA_blog_5157941 = "e-language",
  COCA_blog_5174141 = "e-language",
  COCA_blog_5176541 = "e-language",
  COCA_fict_1000441 = "fiction",
  COCA_fict_1003141 = "fiction",
  COCA_fict_5003241 = "fiction",
  COCA_mag_2029741 = "news",
  COCA_mag_2030941 = "news",
  COCA_mag_4180341 = "news",
  COCA_News_4087357 = "news",
  COCA_News_4087464 = "news",
  COCA_News_4087649 = "news",
  COCA_News_4087995 = "news",
  COCA_Opinion_4061065 = "news",
  COCA_Opinion_4062489 = "news",
  COCA_Opinion_4079063 = "news",
  COCA_Opinion_4090647 = "news",
  COCA_Spoken_4082518 = "spoken",
  COCA_Spoken_4082551 = "spoken",
  COCA_Spoken_4082571 = "spoken",
  COCA_Spoken_4082646 = "spoken",
  COCA_tvm_5208241 = "TV/movies",
  COCA_tvm_5215441 = "TV/movies",
  COCA_tvm_5246241 = "TV/movies",
  COCA_web_5026941 = "e-language",
  COCA_web_5035341 = "e-language",
  COCA_web_5080941 = "e-language"
)
# Files are processed in the order listed; each result is assigned as soon as
# it has been read.
for (file_id in names(register_by_file_id)) {
  assign(
    file_id,
    importEval(
      file = read_excel(
        here("evaluation", paste0(file_id, ".xlsx")),
        col_types = "text"
      ),
      fileID = file_id,
      register = register_by_file_id[[file_id]],
      corpus = "COCA"
    ),
    envir = .GlobalEnv
  )
}
# Command to rbind all COCA and BNC R objects in the local environment.
# Collect the names of all objects whose name contains "BNC" or "COCA" as one
# comma-separated string and print it.
list_of_dataframes <- toString(objects(pattern = "BNC|COCA"))
list_of_dataframes
EvalData <- rbind(BNC_AcaHumBk34, BNC_BAcjH78, BNC_BAcjM107, BNC_BEBl293, BNC_BEEm76, BNC_BERe31, BNC_BFict_b2, BNC_BMass311, BNC_BReg495, BNC_BSer145, BNC_ElanBlogBla12, BNC_ElanBlogSlu30, BNC_ElanEmail102, BNC_ElanForumCar5, BNC_ElanForumRig1, BNC_ElanRev27, BNC_ElanSms33, BNC_ElanSocFac4_pt1, BNC_ElanSocTwi49_pt7, BNC_ElanSocTwi6_pt4, BNC_FictFan41, BNC_FictMis228, BNC_MagAut1397, BNC_MagPc275, BNC_NewMaDas2819, BNC_NewReBet1393, BNC_NewSeGua553, BNC_Sp2m0f33, BNC_Sp2m2f63, BNC_Sp3m1f10, COCA_acad_4000541, COCA_acad_4017541, COCA_acad_4170341, COCA_blog_5157941, COCA_blog_5174141, COCA_blog_5176541, COCA_fict_1000441, COCA_fict_1003141, COCA_fict_5003241, COCA_mag_2029741, COCA_mag_2030941, COCA_mag_4180341, COCA_News_4087357, COCA_News_4087464, COCA_News_4087649, COCA_News_4087995, COCA_Opinion_4061065, COCA_Opinion_4062489, COCA_Opinion_4079063, COCA_Opinion_4090647, COCA_Spoken_4082518, COCA_Spoken_4082551, COCA_Spoken_4082571, COCA_Spoken_4082646, COCA_tvm_5208241, COCA_tvm_5215441, COCA_tvm_5246241, COCA_web_5026941, COCA_web_5035341, COCA_web_5080941)
summary(EvalData)
unique(EvalData$FileID)
unique(EvalData$TagGold)
unique(EvalData$Tag)
# Normalise the spelling of the two special gold labels ("none" -> "NONE",
# "unclear" -> "UNCLEAR") and store TagGold as a factor again.
EvalData <- EvalData %>%
  mutate(
    TagGold = as.character(TagGold),
    TagGold = ifelse(TagGold == "none", "NONE", TagGold),
    TagGold = ifelse(TagGold == "unclear", "UNCLEAR", TagGold),
    TagGold = as.factor(TagGold)
  )
# Persist the combined evaluation data as both .rds and .csv.
saveRDS(EvalData, here("evaluation", "MFTE_Python_Eval_Results.rds")) # Last saved 10 August 2023
write.csv(EvalData, here("evaluation", "MFTE_Python_Eval_Results.csv")) # Last saved 9 August 2023
nrow(EvalData)
summary(EvalData$TagGold) # 293 UNCLEAR
# Share of UNCLEAR tokens (293 out of 61140), multiplied by 100, with a
# two-sided 95% confidence interval (Wilson with continuity correction).
BinomCI(293, 61140,
  conf.level = 0.95,
  sides = "two.sided",
  method = "wilsoncc"
) * 100
# Number of tokens evaluated per corpus and register subcorpus
EvalData %>%
  group_by(Corpus, Register) %>%
  count() %>%
  arrange(-n) %>%
  as.data.frame()
# Number of tokens evaluated per file within each corpus and register
EvalData %>%
  group_by(Corpus, Register) %>%
  count(FileID) %>%
  print(n = 100)

0.3

# Data and plot for the tagger error matrix (gold tag on x, assigned tag on y).
# NOTE(review): min_n, jitter_dist and opacity are used below but are not
# defined in this part of the script; they must be set beforehand.
data_filtered1 <- EvalData %>%
  filter(!TagGold %in% c("UNCLEAR", "unclear")) %>%
  filter(!TagGold %in% c("ACT", "NFP", "GW", "HYPH", "ADD", "AFX", "FW", "WQ", "SYM")) %>%
  # Remove all punctuation tags which are uninteresting here.
  filter(TagGold %in% c(str_extract(Tag, "[A-Z0-9]+"))) %>%
  add_count(Tag, name = "n_tagged") %>%
  add_count(TagGold, name = "n_tagged_gold") %>%
  # Keep only rows whose assigned tag and gold tag each occur at least min_n times
  filter(
    n_tagged >= min_n,
    n_tagged_gold >= min_n
  )
# All tags that still occur on either side, used as the shared factor levels
tags_remaining <- union(
  unique(data_filtered1$Tag),
  unique(data_filtered1$TagGold)
)
data_filtered2 <- data_filtered1 %>%
  mutate(
    Tag = factor(Tag, levels = tags_remaining),
    TagGold = factor(TagGold, levels = tags_remaining)
  ) %>%
  arrange(TagGold)
error_fig <- data_filtered2 %>%
  ggplot(aes(x = TagGold, y = Tag, colour = Evaluation)) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    legend.position = "none"
  ) +
  scale_color_manual(values = c("red2", "chartreuse3")) +
  coord_fixed() +
  # drop = FALSE keeps every level of tags_remaining on both axes
  scale_x_discrete(drop = FALSE) +
  scale_y_discrete(drop = FALSE) +
  geom_jitter(
    aes(size = n_tagged_gold),
    width = jitter_dist,
    height = jitter_dist,
    alpha = opacity
  )
# Save error_fig explicitly instead of relying on the implicit "last plot"
ggsave(here("plots", "TaggerErrorMatrix.svg"), plot = error_fig, width = 9, height = 9)
# Overall evaluation metrics of the tagger for one register.
#   data:     data frame with columns Register, Tag and TagGold
#   register: value of Register to keep
# Returns the `overall` element of caret::confusionMatrix(Tag, TagGold),
# multiplied by 100 and rounded to two decimals.
registerEval <- function(data, register) {
  d <- data %>% filter(Register == register)
  cm <- caret::confusionMatrix(d$Tag, d$TagGold)
  # An unreachable second return (cm$byClass[, 5:7]) was removed; it could
  # never execute after the first return.
  round(cm$overall * 100, 2)
}
summary(data$Register)
registerEval(data, "e-language")
cm <- caret::confusionMatrix(d$Tag, d$TagGold)
# Overall evaluation metrics of the tagger for one register.
#   data:     data frame with columns Register, Tag and TagGold
#   register: value of Register to keep
# Returns the `overall` element of caret::confusionMatrix(Tag, TagGold),
# multiplied by 100 and rounded to two decimals.
registerEval <- function(data, register) {
  d <- data %>% filter(Register == register)
  cm <- caret::confusionMatrix(d$Tag, d$TagGold)
  # An unreachable second return (cm$byClass[, 5:7]) was removed; it could
  # never execute after the first return.
  round(cm$overall * 100, 2)
}
registerEval(data, "e-language")
# Overall evaluation metrics of the tagger for one register.
#   data:     data frame with columns Register, Tag and TagGold
#   register: value of Register to keep
# Returns the `overall` element of caret::confusionMatrix(Tag, TagGold),
# multiplied by 100 and rounded to two decimals.
registerEval <- function(data, register) {
  d <- data %>% filter(Register == register)
  cm <- caret::confusionMatrix(d$Tag, d$TagGold)
  # An unreachable second return (cm$byClass[, 5:7]) was removed; it could
  # never execute after the first return.
  round(cm$overall * 100, 2)
}
?cm
?confusionMatrix
registerEval(data, "e-language")
registerEval(data, "academic")
registerEval(data, "fiction")
registerEval(data, "news")
registerEval(data, "spoken")
registerEval(data, "TV/movies")
# Overall evaluation metrics of the tagger for one corpus.
#   data:    data frame with columns Corpus, Tag and TagGold
#   variety: value of Corpus to keep
# Returns the `overall` element of caret::confusionMatrix(Tag, TagGold),
# multiplied by 100 and rounded to two decimals.
varietyEval <- function(data, variety) {
  d <- data %>% filter(Corpus == variety)
  cm <- caret::confusionMatrix(d$Tag, d$TagGold)
  # An unreachable second return (cm$byClass[, 5:7]) was removed; it could
  # never execute after the first return.
  round(cm$overall * 100, 2)
}
varietyEval(data, "BNC2014")
varietyEval(data, "COCA")
# Print one copy of `fig` per register: swap in that register's rows and use
# the register name as the plot title.
for (reg_name in unique(data$Register)) {
  reg_rows <- filter(data, Register == reg_name)
  print((fig %+% reg_rows) + ggtitle(reg_name))
}
registerEval(data, "academic")
registerEval(data, "e-language")
registerEval(data, "fiction")
registerEval(data, "news")
registerEval(data, "spoken")
registerEval(data, "TV/movies")
# Evaluate the two spoken-like registers together. Register labels are
# case-sensitive: the label is "TV/movies" (a "TV/Movies" filter matches that
# label on no row).
dataSpoken <- data %>%
  filter(Register %in% c("spoken", "TV/movies"))
summary(dataSpoken$Register)
cmSpoken <- caret::confusionMatrix(dataSpoken$Tag, dataSpoken$TagGold)
# Overall metrics multiplied by 100 and rounded to two decimals
round(cmSpoken$overall * 100, 2)
# Overall evaluation metrics of the tagger for a single file.
#   data: data frame with columns FileID, Tag and TagGold (Tag and TagGold
#         are read with levels(), so they are expected to be factors)
#   file: value of FileID to keep
# Returns the unscaled `overall` element of caret::confusionMatrix(Tag, TagGold).
fileEval <- function(data, file) {
  d <- data %>%
    filter(FileID == file) %>%
    # Ensure that the factor levels are the same for the next caret operation
    mutate(Tag = factor(Tag, levels = union(levels(Tag), levels(TagGold)))) %>%
    mutate(TagGold = factor(TagGold, levels = union(levels(Tag), levels(TagGold))))
  cm <- caret::confusionMatrix(d$Tag, d$TagGold)
  # An unreachable second return (cm$byClass[, 5:7]) was removed; it could
  # never execute after the first return.
  cm$overall
}
levels(data$FileID)
fileEval(data, "COCA_Opinion_4079063")
fileEval(data, "BNC_BAcjH78")
# Add an error label made of the incorrectly assigned tag, " -> ", and the
# correct "gold" label. UNCLEAR gold labels are excluded.
errors <- EvalData %>%
  filter(Evaluation == "FALSE") %>%
  filter(TagGold != "UNCLEAR") %>%
  mutate(Error = paste(Tag, TagGold, sep = " -> "))
# Total number of errors
nrow(errors) # 1199
# Add an error label made of the incorrectly assigned tag, " -> ", and the
# correct "gold" label.
errors <- data %>%
  filter(Evaluation == "FALSE") %>%
  mutate(Error = paste(Tag, TagGold, sep = " -> "))
# Total number of errors
nrow(errors) # 1612
# Frequency of each error type, most frequent first
FreqErrors <- errors %>%
  count(Error) %>%
  arrange(desc(n))
# Show error types that occur more than 9 times.
# FreqErrors only has the columns Error and n, so it cannot be grouped by
# Register; the n > 9 filter does not depend on any grouping.
FreqErrors %>%
  filter(n > 9) %>%
  print.data.frame()
errors %>%
filter(Error == "NN -> JJAT") %>%
select(-Output, -Corpus, -Tag, -TagGold) %>%
filter(grepl(x = Token, pattern = "[A-Z]+.")) %>%
print.data.frame()
errors %>%
filter(Error %in% c("NN -> VB", "VB -> NN", "NN -> VPRT", "VPRT -> NN")) %>%
count(Token) %>%
arrange(desc(n)) %>%
print.data.frame()
errors %>%
filter(Error == "NN -> JJPR") %>%
count(Token) %>%
filter(grepl(x = Token, pattern = "[A-Z]+.")) %>%
arrange(desc(n)) %>%
print.data.frame()
errors %>%
filter(Error == "ACT -> NULL") %>%
count(Token) %>%
arrange(desc(n)) %>%
print.data.frame()
# Tokens affected by the "NCOMP -> NULL" error, most frequent first
errors %>%
  filter(Error == "NCOMP -> NULL") %>%
  count(Token) %>%
  arrange(desc(n)) %>%
  print.data.frame()
errors %>%
filter(Error == "NCOMP -> NONE") %>%
count(Token) %>%
arrange(desc(n)) %>%
print.data.frame()
# Reload the saved evaluation results
EvalData <- readRDS(here("evaluation", "MFTE_Python_Eval_Results.rds"))
summary(EvalData)
# Total number of tags manually checked
nrow(EvalData) # 61140
# Number of tags evaluated per file
EvalData %>%
  group_by(FileID) %>%
  count(.) %>%
  arrange(desc(n))
# Number of UNCLEAR tokens
EvalData %>%
  filter(TagGold %in% c("UNCLEAR")) %>%
  count()
BinomCI(293, 61140,

0.2

Releases

Has known vulnerabilities

Mfte

Page 1 of 1

1.0.0

0.95

0.3

0.2

Page 1 of 1

Links

Releases