The Python code assumes that `import pandas as pd` has been run.
Operation |
STATA |
Pandas |
Base R |
Create new dataset from values |
input a b
1 4
2 5
3 6
end
|
d = {'a' : [1,2,3], 'b' : [4,5,6]}
df = pd.DataFrame(d)
|
df <- data.frame(a=1:3, b=4:6)
|
Create new dataset from csv file |
import delim mydata.csv, delimiters(",")
|
df = pd.read_csv('mydata.csv', sep=',')
|
df <- read.csv('mydata.csv', sep=',')
|
Print observations |
|
|
|
Print observations of variable x |
|
|
|
Select only variable x |
|
|
|
Select only variables x and y |
|
|
|
Drop variable x |
|
df = df.drop('x', axis=1)
|
|
Generate new variable |
|
df['z'] = df['x'] + df['y']
|
|
Rename variable |
|
df = df.rename(columns={'x': 'y'})
|
names(df)[names(df) == 'x'] <- 'y'
|
Sort by variable |
|
|
|
Operation |
STATA |
Pandas |
Base R |
Conditionally print observations |
|
|
|
Conditionally print observations with 'or' operator |
|
df[(df['x'] > 1) | (df['y'] < 0)]
|
subset(df, x > 1 | y < 0)
|
Conditionally print observations with 'and' operator |
|
df[(df['x'] > 1) & (df['y'] < 0)]
|
subset(df, x > 1 & y < 0)
|
Print subset of observations based on location |
|
|
|
Print observations with missing values in x |
|
|
|
Description |
STATA |
Pandas |
Base R |
Print summary statistics |
|
|
|
Print information about variables and data types |
|
|
|
Print aggregation of variable |
|
|
|
Group data by variable and summarize |
|
df.groupby('x').describe()
|
aggregate(. ~ x, df, summary)
|
Print frequency table |
|
|
|
Print cross-tabulation |
|
pd.crosstab(df['x'], df['y'])
|
|
Create bins based on values in x in new column 'bins' |
egen bins = cut(x), group(3)
|
df['bins'] = pd.cut(df['x'], 3)
|
|
Operation |
STATA |
Pandas |
Base R |
Reshape data from wide to long panel |
reshape long x, i(i) j(j)
|
pd.wide_to_long(df, ['x'], i='i', j='j')
|
reshape(df, direction='long', varying=grep('j', names(df), value=TRUE), sep='')
|
Reshape data from long to wide panel |
|
df.unstack()
# returns hierarchical columns
|
reshape(df, timevar='x', idvar='i', direction='wide')
|
Operation |
STATA |
Pandas |
Base R |
Vertically concatenate datasets |
|
|
rbind(x, y)
# note that columns must be the same for each dataset
|
Merge datasets on key |
|
pd.merge(x, y, on='key', how='inner')
|
|
Operation |
STATA |
Pandas |
Base R |
Scatter plot |
|
df.plot.scatter('x', 'y')
|
|
Line plot |
|
|
|
Histogram |
|
|
|
Boxplot |
|
|
|