Selecting top and bottom records - Read from CSV file - Export to CSV file - Descriptive statistics with the Python pandas module
In [1]:
import pandas as pd
from numpy import random
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Pool of baby names that the random sample is drawn from.
names = [
    'Bob',
    'Jessica',
    'Mary',
    'John',
    'Mel',
]
In [3]:
# Seed NumPy's global generator so the sequence of draws is repeatable.
random.seed(500)

# Draw 1000 random positions into `names`, one scalar draw at a time,
# and map each position to the name stored there.
random_names = [
    names[position]
    for position in (
        random.randint(low=0, high=len(names)) for _draw in range(1000)
    )
]

# Preview the first five generated names.
random_names[:5]
Out[3]:
['Mary', 'Jessica', 'Jessica', 'Bob', 'Jessica']
In [4]:
# 1000 random birth counts, each drawn with low=1 and high=1000.
births = [
    random.randint(low=1, high=1000)
    for _draw in range(1000)
]

# Preview the first five counts.
births[:5]
Out[4]:
[969, 156, 78, 579, 974]
In [5]:
# Pair each generated name with its birth count as a (name, births) tuple.
BabyData = [(baby, count) for baby, count in zip(random_names, births)]

# Preview the first five pairs.
BabyData[:5]
Out[5]:
[('Mary', 969),
('Jessica', 156),
('Jessica', 78),
('Bob', 579),
('Jessica', 974)]
In [6]:
# Build a two-column frame from the (name, births) tuples.
df = pd.DataFrame(
    BabyData,
    columns=['Names', 'Births'],
)
In [7]:
# Show the first five rows of the frame.
df.head(5)
Out[7]:
Names | Births | |
---|---|---|
0 | Mary | 969 |
1 | Jessica | 156 |
2 | Jessica | 78 |
3 | Bob | 579 |
4 | Jessica | 974 |
In [8]:
# Write the frame to disk as bare data rows: no header line and no index column.
df.to_csv(
    'Data/births1880.csv',
    header=False,
    index=False,
)
In [9]:
# Read the file back with read_csv's default header handling ('infer').
# The file was written without a header line, so the first data row is
# consumed as column labels -- the info() output shows the effect.
df = pd.read_csv(
    'Data/births1880.csv',
    header='infer',
)

# Summarise row count, column labels and dtypes of what was read.
df.info()
Out[9]:
RangeIndex: 999 entries, 0 to 998
Data columns (total 2 columns):
Mary 999 non-null object
969 999 non-null int64
dtypes: int64(1), object(1)
memory usage: 15.7+ KB
In [10]:
# Show the first five rows of the frame.
df.head(n=5)
Out[10]:
Mary | 969 | |
---|---|---|
0 | Jessica | 156 |
1 | Jessica | 78 |
2 | Bob | 579 |
3 | Jessica | 974 |
4 | Jessica | 125 |
In [11]:
# Re-read the file with header=None so that every line is treated as data;
# the columns then get integer labels instead of names.
df = pd.read_csv(
    'Data/births1880.csv',
    header=None,
)

# Summarise row count, column labels and dtypes of what was read.
df.info()
Out[11]:
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
0 1000 non-null object
1 1000 non-null int64
dtypes: int64(1), object(1)
memory usage: 15.7+ KB
In [12]:
# Show the last five rows of the frame.
df.tail(n=5)
Out[12]:
0 | 1 | |
---|---|---|
995 | John | 152 |
996 | Jessica | 512 |
997 | John | 757 |
998 | Jessica | 295 |
999 | John | 153 |
In [13]:
# Re-read the file, supplying the column labels explicitly via `names`.
df = pd.read_csv(
    'Data/births1880.csv',
    names=['Names', 'Births'],
)

# Show the first five rows to check the labels.
df.head()
Out[13]:
Names | Births | |
---|---|---|
0 | Mary | 969 |
1 | Jessica | 156 |
2 | Jessica | 78 |
3 | Bob | 579 |
4 | Jessica | 974 |
In [14]:
# Distinct values found in the 'Names' column.
df.loc[:, 'Names'].unique()
Out[14]:
array(['Mary', 'Jessica', 'Bob', 'John', 'Mel'], dtype=object)
In [15]:
# Print each distinct value of the 'Names' column on its own line.
# The loop body must be indented; at column 0 the cell raises an IndentationError.
for x in df['Names'].unique():
    print(x)
Out[15]:
Mary
Jessica
Bob
John
Mel
In [16]:
# Print summary statistics (count, unique, top, freq) for the 'Names' column.
print(df.loc[:, 'Names'].describe())
Out[16]:
count 1000
unique 5
top Bob
freq 206
Name: Names, dtype: object
In [17]:
# Group the rows by the 'Names' column.
name = df.groupby(by='Names')

# Sum within each group and rebind `df` to the aggregated frame,
# which is indexed by name; the cells that follow read `df` in this form.
df = name.sum()

# Display the per-name totals.
df
Out[18]:
Births | |
---|---|
Names | |
Bob | 107023 |
Jessica | 98024 |
John | 90899 |
Mary | 99636 |
Mel | 102523 |
In [19]:
# Order the rows by 'Births', largest first.
Sorted = df.sort_values(by='Births', ascending=False)

# The first row of the sorted frame is the one with the highest 'Births' value.
Sorted.head(n=1)
Out[19]:
Births | |
---|---|
Names | |
Bob | 107023 |
In [20]:
# Largest value in the 'Births' column.
df.loc[:, 'Births'].max()
Out[20]:
107023
In [21]:
# Bar chart of the 'Births' column, one bar per index entry.
df['Births'].plot(kind='bar')

# Caption for the table displayed below.
print("The most popular name")

# Display the rows ordered by 'Births', largest first.
df.sort_values('Births', ascending=False)
Out[21]:
Births | |
---|---|
Names | |
Bob | 107023 |
Mel | 102523 |
Mary | 99636 |
Jessica | 98024 |
John | 90899 |