Load packages

library(dplyr)
library(ggplot2)
library(magrittr)
library(readr)

Load data

# Read one language's word-length frequency table and label it.
#
# path:     TSV file with integer columns `Length` and `Count`.
# language: label stored in the added `Language` column.
#
# Returns the table with two extra columns: `Language`, and `Proportion`
# (each row's `Count` as a share of that file's total `Count`).
read_length_counts <- function(path, language) {
    # Pin the column types instead of relying on readr's type guessing.
    column_types <- cols(Length = col_integer(), Count = col_integer())
    read_tsv(path, col_types = column_types) %>%
        mutate(Language = language, Proportion = Count / sum(Count))
}

en <- read_length_counts("en.tsv", "English")
fr <- read_length_counts("fr.tsv", "French")

# Stack both languages into one long table; bind_rows() matches columns by
# name rather than by position.
data <- bind_rows(en, fr)
glimpse(data)
## Observations: 50
## Variables: 4
## $ Length     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
## $ Count      <int> 1, 140, 853, 3130, 6919, 11492, 16882, 19461, 16693...
## $ Language   <chr> "English", "English", "English", "English", "Englis...
## $ Proportion <dbl> 9.125586e-06, 1.277582e-03, 7.784125e-03, 2.856309e...

Plot word length frequency

# Draw each language's proportion of words at every length, one colour per
# language, as points joined by a line.
ggplot(data, aes(x = Length, y = Proportion, colour = Language)) +
    geom_point() +
    geom_line()

Perform KS test and Wilcoxon test

# Expand each frequency table into one word length per counted word, so the
# two-sample tests below receive raw observations instead of counts.
en_length <- rep.int(en$Length, times = en$Count)
fr_length <- rep.int(fr$Length, times = fr$Count)

# Two-sample Kolmogorov-Smirnov test on the two word-length samples.
ks.test(en_length, fr_length)
## Warning in ks.test(en_length, fr_length): p-value will be approximate in
## the presence of ties
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  en_length and fr_length
## D = 0.33232, p-value < 2.2e-16
## alternative hypothesis: two-sided
# Wilcoxon rank-sum test on the same two word-length samples.
wilcox.test(x = en_length, y = fr_length)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  en_length and fr_length
## W = 1.0546e+10, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
# Median French word length minus median English word length
# (diff() subtracts the first element from the second).
diff_median <- diff(c(median(en_length), median(fr_length)))

The median word length of French is greater than that of English by 2 (median French length minus median English length).