However (at least for me), the Vim user base is still a mystery. Don't get me wrong, resources such as the Vim subreddit and the VimConf YouTube videos are great.
But I think that with a "data-driven" approach, we might actually find some interesting things about the VimUniverse!
Taking the Stack Overflow Developer Survey from 2019, I will try to answer some questions about the VIM community:
Notes:
Let's start by importing some libraries and the data.
# Libraries used by the analysis cells below: pandas for the survey data,
# pathlib for locating the CSV, bokeh for the interactive charts.
import pandas
import pathlib
import matplotlib.pyplot as plt
import numpy
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource
from bokeh.transform import jitter
# Configure bokeh so that subsequent show() calls render inside the notebook.
output_notebook()
We load the data in, and print some example rows from the survey.
# Preview settings: number of random rows to show and the columns to display.
to_print = 15
interesting_cols = ["DevEnviron", "Country", "Age", "ConvertedComp"]

# Locate the 2019 survey export inside the local "data" folder.
data_folder = pathlib.Path("data")
survey_file = data_folder.joinpath("2019", "survey_results_public.csv")

# Read the complete survey into a DataFrame.
raw_data = pandas.read_csv(survey_file)

# Show a random sample of the interesting columns (rendered as the cell output).
raw_data.loc[:, interesting_cols].sample(n=to_print)
Looks alright. Let's get started.
# Share of each country's survey respondents that regularly use the editor
# in focus, drawn as a bar chart with one bar per country.

# define constraints
editor_in_focus = "Vim"
columns_of_interest = ["Country", "DevEnviron"]
min_respondents_per_country = 50
color_main = "CornFlowerBlue"
color_sec = "Crimson"
figure_thres = 30  # bars at or above this percentage use color_sec

# create a dataframe only with VIM users
# Only countries with strictly more than `min_respondents_per_country`
# respondents are kept, so tiny samples do not dominate the chart.
country_respondents = raw_data.Country.value_counts()
large_countries = country_respondents.index[
    country_respondents.gt(min_respondents_per_country)
]
data = raw_data[raw_data.Country.isin(large_countries)]
data = data.dropna(subset=columns_of_interest)
# regex=False: match the editor name literally instead of as a regular
# expression, so names with special characters cannot break the filter.
data = data[data.DevEnviron.str.contains(editor_in_focus, regex=False)]

# convert the dataframe to arrays for bokeh
# Dividing aligns both series on the country name; countries removed by the
# filters above come out as NaN and are dropped.
data_to_plot = (data.Country.value_counts() / country_respondents).dropna()
countries = data_to_plot.index.to_list()
usage = [v * 100 for v in data_to_plot.values]
# Countries ordered by ascending usage (stable sort); used as the axis order.
sorted_countries = data_to_plot.sort_values(kind="mergesort").index.to_list()

# create bokeh figure
tooltips = [("Country", "@x"), ("Percentage", "@top{1.11}%")]
fill_color = [color_main if pct < figure_thres else color_sec for pct in usage]
# x_range fixes the order of the categorical axis: pass the sorted list so
# the bars are ordered by usage, as in the per-editor charts.
p = figure(x_range=sorted_countries, tooltips=tooltips, plot_width=1000, title=f"Percentage of survey respondents that are regular {editor_in_focus} users.")
p.vbar(x=countries, top=usage, width=0.9, color=fill_color)
p.xgrid.grid_line_color = "white"
p.y_range.start = 0
p.xaxis.major_label_orientation = "vertical"
show(p)

# print informative percentages
# NOTE: `data` only contains users from countries above the respondent
# threshold, while the denominator is every survey respondent.
perc = round(data.shape[0] / raw_data.shape[0] * 100, 2)
print(f"In {raw_data.shape[0]} respondents, {data.shape[0]} use {editor_in_focus} regularly. (or {perc}%)")
Interesting: the country with the highest percentage of respondents who are regular VIM users is Paraguay 🇵🇾, where 44%(!) of respondents are regular VIM users!
Keep in mind that Paraguay has a total of only 52 people responding to the survey, so we might wonder whether that is representative.
Here are the ones that follow:
I'm particularly interested in why this might be the case, especially for Paraguay, Switzerland and South Korea.
# Per country: respondents of the selected gender who regularly use the
# editor in focus, as a percentage of ALL respondents of that country.

# constraints and variables
editor_in_focus = "Vim"
gender_of_interest = "Woman"
columns_of_interest = ["Country", "DevEnviron"]
min_respondents_per_country = 50
color_main = "LightPink"
color_sec = "Plum"
thres = 3  # bars at or above this percentage use color_sec

# data only with female respondents
# Only countries with strictly more than `min_respondents_per_country`
# respondents are kept.
country_respondents = raw_data.Country.value_counts()
large_countries = country_respondents.index[
    country_respondents.gt(min_respondents_per_country)
]
data = raw_data[raw_data.Country.isin(large_countries)]
data = data.dropna(subset=columns_of_interest)
# Exact comparison: only rows whose Gender value equals the string are kept.
data = data[data.Gender == gender_of_interest]
# regex=False: match the editor name literally, not as a regular expression.
data = data[data.DevEnviron.str.contains(editor_in_focus, regex=False)]

# convert data for bokeh
# Dividing aligns both series on the country name; countries without any
# matching respondent come out as NaN and are dropped.
data_to_plot = (data.Country.value_counts() / country_respondents).dropna()
countries = data_to_plot.index.to_list()
usage = [v * 100 for v in data_to_plot.values]
# Countries ordered by ascending usage (stable sort); used as the axis order.
sorted_countries = data_to_plot.sort_values(kind="mergesort").index.to_list()

# create bokeh figure
tooltips = [("Country", "@x"), ("Percentage", "@top{1.11}%")]
fill_color = [color_main if pct < thres else color_sec for pct in usage]
# x_range fixes the order of the categorical axis: pass the sorted list so
# the bars are ordered by usage, as in the per-editor charts.
p = figure(x_range=sorted_countries, tooltips=tooltips, plot_width=1000, title=f"Percentage of females that are regular {editor_in_focus} users per country.")
p.vbar(x=countries, top=usage, width=0.9, color=fill_color)
p.xgrid.grid_line_color = "white"
p.y_range.start = 0
p.xaxis.major_label_orientation = "vertical"
show(p)

# print informative percentage
# NOTE: `data` only contains users from countries above the respondent
# threshold, while the denominator is every survey respondent.
perc = round(data.shape[0] / raw_data.shape[0] * 100, 2)
print(f"In {raw_data.shape[0]} respondents, {data.shape[0]} respondents of gender {gender_of_interest} use {editor_in_focus} regularly. (or {perc}%)")
Oh! South Korea 🇰🇷 appears to pop up here again, along with Paraguay. But it appears that female VIM users are on average about 1% of the respondents of each country.
# Average age of the users of every editor/IDE listed in the survey, drawn
# as a horizontal bar chart with the editor in `ide` highlighted.

# constraints
ide = "Vim"
color = "green"
column_of_interest = "Age"
columns_of_interest = ["DevEnviron"]

data = raw_data.dropna(subset=columns_of_interest)
# DevEnviron holds a ";"-separated list of editors per respondent.
# Sorting the unique names makes the iteration order reproducible.
editors = sorted(
    {name for names in data.DevEnviron.str.split(";") for name in names}
)

# build data
# NOTE(review): dropna() without a subset discards every respondent that
# left ANY survey column unanswered, not just the age. This keeps the
# original behaviour; confirm that restricting to complete responses is
# intended. Done once here instead of once per editor.
complete_rows = data.dropna()
editor_lists = complete_rows.DevEnviron.str.split(";")
editor_dict = {}
for editor in editors:
    # Exact membership in the split list: a substring/regex match would
    # count "Visual Studio Code" users as "Visual Studio" users and cannot
    # handle "Notepad++" without special-casing.
    uses_editor = [editor in names for names in editor_lists]
    editor_dict[editor] = complete_rows.loc[uses_editor, column_of_interest].values.mean()

# prepare data for graph
right = list(editor_dict.values())
y = list(editor_dict.keys())
sorted_y = sorted(editor_dict, key=editor_dict.get)  # ascending average

# plot the graph
fill_color = [color if name == ide else "lightgrey" for name in y]
tooltips = [("IDE", "@y"), (column_of_interest, "@right{1.1}")]
p = figure(plot_height=500, plot_width=900, y_range=sorted_y, title="Average age of IDE users", tooltips=tooltips)
p.hbar(y=y, height=0.5, left=0, right=right, color=fill_color)
# Zoom the x axis onto a fixed 29-34 window so the differences are visible.
p.x_range.start = 29
p.x_range.end = 34
show(p)
Some interesting observations in this graph:
Emacs has the 5th oldest user base at 31.7 years old.
Vim, on the other hand, has a relatively young user base, sitting at roughly 30 years old.
# constraints
# Average value of the ConvertedComp column for the users of every
# editor/IDE listed in the survey, drawn as a horizontal bar chart with the
# editor in `ide` highlighted.
ide = "Vim"
color = "green"
column_of_interest = "ConvertedComp"
columns_of_interest = ["DevEnviron"]

data = raw_data.dropna(subset=columns_of_interest)
# DevEnviron holds a ";"-separated list of editors per respondent.
# Sorting the unique names makes the iteration order reproducible.
editors = sorted(
    {name for names in data.DevEnviron.str.split(";") for name in names}
)

# build data source
# NOTE(review): dropna() without a subset discards every respondent that
# left ANY survey column unanswered, not just the compensation. This keeps
# the original behaviour; confirm that restricting to complete responses is
# intended. Done once here instead of once per editor.
complete_rows = data.dropna()
editor_lists = complete_rows.DevEnviron.str.split(";")
editor_dict = {}
for editor in editors:
    # Exact membership in the split list: a substring/regex match would
    # count "Visual Studio Code" users as "Visual Studio" users and cannot
    # handle "Notepad++" without special-casing.
    uses_editor = [editor in names for names in editor_lists]
    editor_dict[editor] = complete_rows.loc[uses_editor, column_of_interest].values.mean()

# prepare data for graph
right = list(editor_dict.values())
y = list(editor_dict.keys())
sorted_y = sorted(editor_dict, key=editor_dict.get)  # ascending average

# plot
fill_color = [color if name == ide else "lightgrey" for name in y]
tooltips = [("IDE", "@y"), (column_of_interest, "@right{1.1}")]
p = figure(plot_height=500, plot_width=900, y_range=sorted_y, title="Average yearly salary in USD of IDE users", tooltips=tooltips)
p.hbar(y=y, height=0.5, left=0, right=right, color=fill_color)
# Show plain numbers instead of scientific notation on the bottom axis.
p.below[0].formatter.use_scientific = False
p.x_range.start = 0
show(p)
VIM users earn on average 155k USD per year. This makes them the 4th best-paid "IDE users".
Moreover, the highest-paid developers are more likely to use Emacs on a regular basis!
Well, this was a fun exercise, along the way we discovered some interesting things about VIM users:
Questions? Ideas for improvements? Just contact me!