Mfte

Latest version: v1.0.0.4

Safety actively analyzes 624524 Python packages for vulnerabilities to keep your Python projects secure.

Scan your dependencies

1.0.0

1. Freeze experimental features

0.95

sides = "two.sided",
method = "wilsoncc") * 100
# Number of tokens evaluated per corpus and register subcorpus
# Row counts for every Corpus x Register combination, largest first,
# converted to a plain data frame before being printed.
EvalData %>%
  group_by(Corpus, Register) %>%
  count() %>%
  arrange(desc(n)) %>%
  as.data.frame()

# Row counts per FileID within each Corpus x Register combination,
# printing up to 100 rows.
EvalData %>%
  group_by(Corpus, Register) %>%
  count(FileID) %>%
  print(n = 100)


CHANGELOG

0.3

# Prepare the evaluation data for the tagger error matrix:
#  - drop rows whose gold tag is "UNCLEAR"/"unclear",
#  - drop rows whose gold tag is one of the tags listed in the second filter(),
#  - keep only gold tags that appear among the alphanumeric parts extracted
#    from `Tag` (this is what removes the punctuation tags),
#  - keep only rows whose assigned tag and gold tag each occur at least
#    `min_n` times (`min_n` must already be defined).
data_filtered1 <- EvalData %>%
  filter(!TagGold %in% c("UNCLEAR", "unclear")) %>%
  filter(
    !TagGold %in% c(
      "ACT", "NFP", "GW", "HYPH", "ADD", "AFX", "FW", "WQ", "SYM"
    )
  ) %>%
  # Remove all punctuation tags which are uninteresting here.
  filter(TagGold %in% c(str_extract(Tag, "[A-Z0-9]+"))) %>%
  add_count(Tag, name = "n_tagged") %>%
  add_count(TagGold, name = "n_tagged_gold") %>%
  filter(
    n_tagged >= min_n,
    n_tagged_gold >= min_n
  )
# One shared set of levels: every tag still present after filtering, whether
# it occurs as an assigned tag or as a gold tag.
tags_remaining <- union(
  unique(data_filtered1$Tag),
  unique(data_filtered1$TagGold)
)

# Give both tag columns the same factor levels, then order rows by gold tag.
data_filtered2 <- data_filtered1 %>%
  mutate(
    across(
      c(Tag, TagGold),
      function(tag_column) factor(tag_column, levels = tags_remaining)
    )
  ) %>%
  arrange(TagGold)
# Tagger error matrix: gold tag on the x axis, assigned tag on the y axis,
# one jittered point per row, coloured by `Evaluation`.
# `jitter_dist` and `opacity` must already be defined.
error_fig <- data_filtered2 %>%
  ggplot(aes(x = TagGold, y = Tag, colour = Evaluation)) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    legend.position = "none"
  ) +
  # NOTE(review): colours are assigned in the order of the Evaluation levels
  # (presumably FALSE = "red2", TRUE = "chartreuse3") - confirm.
  scale_color_manual(values = c("red2", "chartreuse3")) +
  coord_fixed() +
  # drop = FALSE keeps unused factor levels on both axes.
  scale_x_discrete(drop = FALSE) +
  scale_y_discrete(drop = FALSE) +
  geom_jitter(
    aes(size = n_tagged_gold),
    width = jitter_dist,
    height = jitter_dist,
    alpha = opacity
  )

# Pass the plot explicitly instead of relying on ggsave()'s default of
# last_plot(), so the saved file cannot silently pick up another plot.
ggsave(
  here("plots", "TaggerErrorMatrix.svg"),
  plot = error_fig,
  width = 9,
  height = 9
)
# Tagger evaluation for a single register.
#
# Keeps the rows of `data` whose `Register` equals `register` and compares the
# assigned tags (`Tag`) with the gold tags (`TagGold`) using
# caret::confusionMatrix().
#
# data:     data frame with the columns Register, Tag and TagGold.
# register: value of Register to evaluate.
# by_class: FALSE (default) returns the overall statistics multiplied by 100
#           and rounded to two decimals; TRUE returns columns 5 to 7 of the
#           per-class table (presumably Precision, Recall and F1 - confirm
#           against the caret documentation).
registerEval <- function(data, register, by_class = FALSE) {
  d <- data %>% filter(Register == register)
  cm <- caret::confusionMatrix(d$Tag, d$TagGold)
  # The per-class table previously sat behind a second return() that could
  # never be reached; it is now available on request.
  if (by_class) {
    return(cm$byClass[, 5:7])
  }
  round(cm$overall * 100, 2)
}
# Summarise the Register column, then evaluate the "e-language" register.
summary(data$Register)
registerEval(data, "e-language")
# NOTE(review): in the visible script `d` is only assigned inside the
# evaluation functions, so this top-level line presumably fails unless `d`
# already exists in the session - confirm or remove.
cm <- caret::confusionMatrix(d$Tag, d$TagGold)
# Tagger evaluation for a single register.
#
# Keeps the rows of `data` whose `Register` equals `register` and compares the
# assigned tags (`Tag`) with the gold tags (`TagGold`) using
# caret::confusionMatrix().
#
# data:     data frame with the columns Register, Tag and TagGold.
# register: value of Register to evaluate.
# by_class: FALSE (default) returns the overall statistics multiplied by 100
#           and rounded to two decimals; TRUE returns columns 5 to 7 of the
#           per-class table (presumably Precision, Recall and F1 - confirm
#           against the caret documentation).
registerEval <- function(data, register, by_class = FALSE) {
  d <- data %>% filter(Register == register)
  cm <- caret::confusionMatrix(d$Tag, d$TagGold)
  # The per-class table previously sat behind a second return() that could
  # never be reached; it is now available on request.
  if (by_class) {
    return(cm$byClass[, 5:7])
  }
  round(cm$overall * 100, 2)
}
# Evaluate the "e-language" register with the definition directly above.
registerEval(data, "e-language")
# Tagger evaluation for a single register.
#
# Keeps the rows of `data` whose `Register` equals `register` and compares the
# assigned tags (`Tag`) with the gold tags (`TagGold`) using
# caret::confusionMatrix().
#
# data:     data frame with the columns Register, Tag and TagGold.
# register: value of Register to evaluate.
# by_class: FALSE (default) returns the overall statistics multiplied by 100
#           and rounded to two decimals; TRUE returns columns 5 to 7 of the
#           per-class table (presumably Precision, Recall and F1 - confirm
#           against the caret documentation).
registerEval <- function(data, register, by_class = FALSE) {
  d <- data %>% filter(Register == register)
  cm <- caret::confusionMatrix(d$Tag, d$TagGold)
  # The per-class table previously sat behind a second return() that could
  # never be reached; it is now available on request.
  if (by_class) {
    return(cm$byClass[, 5:7])
  }
  round(cm$overall * 100, 2)
}
# Look up help for `cm` and for confusionMatrix().
?cm
?confusionMatrix
# Run the register evaluation once per register value.
registerEval(data, "e-language")
registerEval(data, "academic")
registerEval(data, "fiction")
registerEval(data, "news")
registerEval(data, "spoken")
registerEval(data, "TV/movies")
# Tagger evaluation for a single corpus.
#
# Keeps the rows of `data` whose `Corpus` equals `variety` and compares the
# assigned tags (`Tag`) with the gold tags (`TagGold`) using
# caret::confusionMatrix().
#
# data:     data frame with the columns Corpus, Tag and TagGold.
# variety:  value of Corpus to evaluate.
# by_class: FALSE (default) returns the overall statistics multiplied by 100
#           and rounded to two decimals; TRUE returns columns 5 to 7 of the
#           per-class table (presumably Precision, Recall and F1 - confirm
#           against the caret documentation).
varietyEval <- function(data, variety, by_class = FALSE) {
  d <- data %>% filter(Corpus == variety)
  cm <- caret::confusionMatrix(d$Tag, d$TagGold)
  # The per-class table previously sat behind a second return() that could
  # never be reached; it is now available on request.
  if (by_class) {
    return(cm$byClass[, 5:7])
  }
  round(cm$overall * 100, 2)
}
# Run the corpus-level evaluation for each of the two corpora.
varietyEval(data, "BNC2014")
varietyEval(data, "COCA")
# Redraw `fig` once per register: swap in that register's rows with %+% and
# use the register value as the plot title. `fig` must already be defined.
for (i in unique(data$Register)) {
  print((fig %+% filter(data, Register == i)) + ggtitle(i))
}
# Run the register evaluation once per register value.
registerEval(data, "academic")
registerEval(data, "e-language")
registerEval(data, "fiction")
registerEval(data, "news")
registerEval(data, "spoken")
registerEval(data, "TV/movies")
# Pool the "spoken" and "TV/movies" registers and evaluate them together.
# The register is spelled "TV/movies" (lower-case "m"), matching the
# registerEval() calls in this script; the "TV/Movies" spelling used in an
# earlier attempt does not match that spelling.
dataSpoken <- data %>%
  filter(Register %in% c("spoken", "TV/movies"))
summary(dataSpoken$Register)

# Overall statistics for the pooled data, multiplied by 100 and rounded to
# two decimals.
cmSpoken <- caret::confusionMatrix(dataSpoken$Tag, dataSpoken$TagGold)
round(cmSpoken$overall * 100, 2)
# Tagger evaluation for a single file.
#
# Keeps the rows of `data` whose `FileID` equals `file`, gives `Tag` and
# `TagGold` the same factor levels, and compares them using
# caret::confusionMatrix().
#
# data:     data frame with the columns FileID, Tag and TagGold (Tag and
#           TagGold are read with levels(), so they are expected to be factors).
# file:     value of FileID to evaluate.
# by_class: FALSE (default) returns the unrounded overall statistics; TRUE
#           returns columns 5 to 7 of the per-class table (presumably
#           Precision, Recall and F1 - confirm against the caret documentation).
fileEval <- function(data, file, by_class = FALSE) {
  d <- data %>%
    filter(FileID == file) %>%
    # Ensure that the factor levels are the same for the next caret operation.
    # The second mutate() sees the already-widened levels of Tag, so both
    # columns end up with the same union of levels.
    mutate(Tag = factor(Tag, levels = union(levels(Tag), levels(TagGold)))) %>%
    mutate(TagGold = factor(TagGold, levels = union(levels(Tag), levels(TagGold))))
  cm <- caret::confusionMatrix(d$Tag, d$TagGold)
  # The per-class table previously sat behind a second return() that could
  # never be reached; it is now available on request.
  if (by_class) {
    return(cm$byClass[, 5:7])
  }
  cm$overall
}
# List the FileID levels, then evaluate two individual files.
levels(data$FileID)
fileEval(data, "COCA_Opinion_4079063")
fileEval(data, "BNC_BAcjH78")
# Add an error label made of the incorrectly assigned tag, an arrow (" -> ") and then the correct "gold" tag
# Keep the rows evaluated as "FALSE" whose gold tag is not "UNCLEAR" and add
# an Error column of the form "<Tag> -> <TagGold>".
# NOTE(review): `EvalDat2` is not defined anywhere in the visible script; the
# later versions of this statement read from `EvalData` and `data` - confirm
# which source is intended.
errors <- EvalDat2 %>%
filter(Evaluation=="FALSE") %>%
filter(TagGold != "UNCLEAR") %>%
mutate(Error = paste(Tag, TagGold, sep = " -> "))
# Add an error label made of the incorrectly assigned tag, an arrow (" -> ") and then the correct "gold" tag
# Keep the rows evaluated as "FALSE" whose gold tag is not "UNCLEAR" and add
# an Error column of the form "<Tag> -> <TagGold>".
# NOTE(review): `EvalDat` is not defined anywhere in the visible script
# (possibly a truncated `EvalData`) - confirm.
errors <- EvalDat %>%
filter(Evaluation=="FALSE") %>%
filter(TagGold != "UNCLEAR") %>%
mutate(Error = paste(Tag, TagGold, sep = " -> "))
# Add an error label made of the incorrectly assigned tag, an arrow (" -> ") and then the correct "gold" tag
# Keep the rows evaluated as "FALSE" whose gold tag is not "UNCLEAR" and add
# an Error column of the form "<Tag> -> <TagGold>".
errors <- EvalData %>%
  filter(Evaluation == "FALSE") %>%
  filter(TagGold != "UNCLEAR") %>%
  mutate(Error = paste(Tag, TagGold, sep = " -> "))
# Total number of errors
nrow(errors) # 1199
# Add an error label made of the incorrectly assigned tag, an arrow (" -> ") and then the correct "gold" tag
# Keep the rows of `data` evaluated as "FALSE" and add an Error column of the
# form "<Tag> -> <TagGold>".
errors <- data %>%
  filter(Evaluation == "FALSE") %>%
  mutate(Error = paste(Tag, TagGold, sep = " -> "))
# Total number of errors
nrow(errors) # 1612
# Frequency of each error type ("<Tag> -> <TagGold>"), most frequent first.
FreqErrors <- errors %>%
  count(Error) %>%
  arrange(desc(n))

# Show the error types that occur more than nine times.
# FreqErrors only has the columns Error and n, so the former
# group_by(Register) step referred to a column that does not exist here; the
# n > 9 filter does not depend on grouping, so the step is dropped.
# NOTE(review): if a per-register breakdown was intended, count by Register
# and Error instead - confirm.
FreqErrors %>%
  filter(n > 9) %>%
  print.data.frame()
# "NN -> JJAT" errors whose token matches "[A-Z]+." (one or more upper-case
# letters followed by any character), without the Output/Corpus/tag columns.
errors %>%
  filter(Error == "NN -> JJAT") %>%
  select(-c(Output, Corpus, Tag, TagGold)) %>%
  filter(grepl("[A-Z]+.", Token)) %>%
  print.data.frame()

# Tokens involved in noun/verb confusions, most frequent first.
errors %>%
  filter(
    Error %in% c("NN -> VB", "VB -> NN", "NN -> VPRT", "VPRT -> NN")
  ) %>%
  count(Token, sort = TRUE) %>%
  print.data.frame()

# Tokens behind "NN -> JJPR" errors that match "[A-Z]+.", most frequent first.
errors %>%
  filter(Error == "NN -> JJPR") %>%
  count(Token) %>%
  filter(grepl("[A-Z]+.", Token)) %>%
  arrange(desc(n)) %>%
  print.data.frame()

# Tokens behind "ACT -> NULL" errors, most frequent first.
errors %>%
  filter(Error == "ACT -> NULL") %>%
  count(Token, sort = TRUE) %>%
  print.data.frame()

# Tokens behind "NCOMP -> NULL" errors, most frequent first (printed twice,
# as in the original sequence of statements).
errors %>%
  filter(Error == "NCOMP -> NULL") %>%
  count(Token, sort = TRUE) %>%
  print.data.frame()
errors %>%
  filter(Error == "NCOMP -> NULL") %>%
  count(Token, sort = TRUE) %>%
  print.data.frame()

# Tokens behind "NCOMP -> NONE" errors, most frequent first.
errors %>%
  filter(Error == "NCOMP -> NONE") %>%
  count(Token, sort = TRUE) %>%
  print.data.frame()
# Load the evaluation results from evaluation/MFTE_Python_Eval_Results.rds
# (path built with here()) and print a summary of every column.
EvalData <- readRDS(here("evaluation", "MFTE_Python_Eval_Results.rds"))
summary(EvalData)
# Total number of tags manually checked
nrow(EvalData) # 61140
# Number of tags evaluated per file
# Row count for each FileID, largest first.
EvalData %>%
  group_by(FileID) %>%
  count() %>%
  arrange(desc(n))
# Number of tokens whose gold tag is UNCLEAR
# Count the rows whose gold tag is "UNCLEAR".
EvalData %>%
  filter(TagGold %in% "UNCLEAR") %>%
  count()
BinomCI(293, 61140,

0.2

Links

Releases

Has known vulnerabilities

© 2024 Safety CLI Cybersecurity Inc. All Rights Reserved.