Comparison Pandas with SQL Query - Python Pandas Module. In the following codes I will compare Pandas with SQL Query.
Import Modules:
import numpy as np
import pandas as pd
In [2]:
tips = pd.read_csv('Data/tips.csv')
tips.head()
Out[2]:
totalBill | tip | sex | is_smoker | tip_day | tip_time | size | |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner Time | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner Time | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner Time | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner Time | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner Time | 4 |
SQL Query:
SELECT totalBill, tip, is_smoker, tip_time FROM tips LIMIT 5;
In [3]:
tips[['totalBill', 'tip', 'is_smoker', 'tip_time']].head()
Out[3]:
totalBill | tip | is_smoker | tip_time | |
---|---|---|---|---|
0 | 16.99 | 1.01 | No | Dinner Time |
1 | 10.34 | 1.66 | No | Dinner Time |
2 | 21.01 | 3.50 | No | Dinner Time |
3 | 23.68 | 3.31 | No | Dinner Time |
4 | 24.59 | 3.61 | No | Dinner Time |
SQL Query:
SELECT * FROM tips WHERE tip_time = 'Dinner Time' LIMIT 5;
In [4]:
tips[tips['tip_time'] == 'Dinner Time'].head()
Out[4]:
totalBill | tip | sex | is_smoker | tip_day | tip_time | size | |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner Time | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner Time | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner Time | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner Time | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner Time | 4 |
In [5]:
is_Dinner Time = tips['tip_time'] == 'Dinner Time'
In [6]:
is_Dinner Time.value_counts()
Out[6]:
True 176
False 68
Name: tip_time, dtype: int64
SQL Query:
SELECT * FROM tips WHERE tip_time = 'Dinner Time' AND tip > 5.00;
In [7]:
tips[(tips['tip_time'] == 'Dinner Time') & (tips['tip'] > 5.00 )].head(10)
Out[7]:
totalBill | tip | sex | is_smoker | tip_day | tip_time | size | |
---|---|---|---|---|---|---|---|
23 | 39.42 | 7.58 | Male | No | Saturday | Dinner Time | 4 |
44 | 30.40 | 5.60 | Male | No | Sun | Dinner Time | 4 |
47 | 32.40 | 6.00 | Male | No | Sun | Dinner Time | 4 |
52 | 34.81 | 5.20 | Female | No | Sun | Dinner Time | 4 |
59 | 48.27 | 6.73 | Male | No | Saturday | Dinner Time | 4 |
116 | 29.93 | 5.07 | Male | No | Sun | Dinner Time | 4 |
155 | 29.85 | 5.14 | Female | No | Sun | Dinner Time | 5 |
170 | 50.81 | 10.00 | Male | Yes | Saturday | Dinner Time | 3 |
172 | 7.25 | 5.15 | Male | Yes | Sun | Dinner Time | 2 |
181 | 23.33 | 5.65 | Male | Yes | Sun | Dinner Time | 2 |
SQL Query:
SELECT * FROM tips WHERE size >= 5 OR totalBill > 45;
In [8]:
tips[(tips['size'] >= 5) | (tips['totalBill'] > 45)]
Out[8]:
totalBill | tip | sex | is_smoker | tip_day | tip_time | size | |
---|---|---|---|---|---|---|---|
59 | 48.27 | 6.73 | Male | No | Saturday | Dinner Time | 4 |
125 | 29.80 | 4.20 | Female | No | Thur | Lunch Time | 6 |
141 | 34.30 | 6.70 | Male | No | Thur | Lunch Time | 6 |
142 | 41.19 | 5.00 | Male | No | Thur | Lunch Time | 5 |
143 | 27.05 | 5.00 | Female | No | Thur | Lunch Time | 6 |
155 | 29.85 | 5.14 | Female | No | Sun | Dinner Time | 5 |
156 | 48.17 | 5.00 | Male | No | Sun | Dinner Time | 6 |
170 | 50.81 | 10.00 | Male | Yes | Saturday | Dinner Time | 3 |
182 | 45.35 | 3.50 | Male | Yes | Sun | Dinner Time | 3 |
185 | 20.69 | 5.00 | Male | No | Sun | Dinner Time | 5 |
187 | 30.46 | 2.00 | Male | Yes | Sun | Dinner Time | 5 |
212 | 48.33 | 9.00 | Male | No | Saturday | Dinner Time | 4 |
216 | 28.15 | 3.00 | Male | Yes | Saturday | Dinner Time | 5 |
In [9]:
frame = pd.DataFrame({'col1': ['A', 'B', np.NaN, 'C', 'D'],
'col2': ['F', np.NaN, 'G', 'H', 'I']})
frame
Out[9]:
col1 | col2 | |
---|---|---|
0 | A | F |
1 | B | NaN |
2 | NaN | G |
3 | C | H |
4 | D | I |
SQL Query:
SELECT * FROM frame WHERE col2 IS NULL;
In [10]:
frame[frame['col2'].isnull()]
Out[10]:
col1 | col2 | |
---|---|---|
1 | B | NaN |
SQL Query:
SELECT * FROM frame WHERE col1 IS NOT NULL;
In [11]:
frame[frame['col1'].notnull()]
Out[11]:
col1 | col2 | |
---|---|---|
0 | A | F |
1 | B | NaN |
3 | C | H |
4 | D | I |
SQL Query:
SELECT sex, count(*) FROM tips GROUP BY sex;
In [12]:
tips.groupby('sex').size()
Out[12]:
sex
Female 87
Male 157
dtype: int64
In [13]:
tips.groupby('sex').count()
Out[13]:
totalBill | tip | is_smoker | tip_day | tip_time | size | |
---|---|---|---|---|---|---|
sex | ||||||
Female | 87 | 87 | 87 | 87 | 87 | 87 |
Male | 157 | 157 | 157 | 157 | 157 | 157 |
In [14]:
tips.groupby('sex')['totalBill', 'tip'].count()
Out[14]:
totalBill | tip | |
---|---|---|
sex | ||
Female | 87 | 87 |
Male | 157 | 157 |
SQL Query:
SELECT tip_day, AVG(tip), COUNT(*) FROM tips GROUP BY tip_day;
In [15]:
tips.groupby('tip_day').agg({'tip':np.mean, 'tip_day': np.size})
Out[15]:
tip | tip_day | |
---|---|---|
tip_day | ||
Friday | 2.734737 | 19 |
Saturday | 2.993103 | 87 |
Sun | 3.255132 | 76 |
Thur | 2.771452 | 62 |
SQL Query:
SELECT is_smoker, tip_day, COUNT(*), AVG(tip) FROM tips GROUP BY is_smoker, tip_day;
In [16]:
tips.groupby(['is_smoker', 'tip_day']).agg({'tip':np.mean, 'tip_day': np.size})
Out[16]:
tip | tip_day | ||
---|---|---|---|
is_smoker | tip_day | ||
No | Friday | 2.812500 | 4 |
Saturday | 3.102889 | 45 | |
Sun | 3.167895 | 57 | |
Thur | 2.673778 | 45 | |
Yes | Friday | 2.714000 | 15 |
Saturday | 2.875476 | 42 | |
Sun | 3.516842 | 19 | |
Thur | 3.030000 | 17 |
In [17]:
df1 = pd.DataFrame({'key': ['A', 'B', 'C', 'D'],
'value': np.random.randn(4)})
df2 = pd.DataFrame({'key': ['B', 'D', 'D', 'E'],
'value': np.random.randn(4)})
df1
Out[17]:
key | value | |
---|---|---|
0 | A | -1.253014 |
1 | B | -0.593100 |
2 | C | 0.718313 |
3 | D | 0.841464 |
SQL Query:
SELECT * FROM df1 INNER JOIN df2 ON df1.key = df2.key;
In [18]:
pd.merge(df1, df2, on='key')
Out[18]:
key | value_x | value_y | |
---|---|---|---|
0 | B | -0.593100 | -0.080075 |
1 | D | 0.841464 | 1.130292 |
2 | D | 0.841464 | 1.458245 |
In [19]:
indexed_df2 = df2.set_index('key')
pd.merge(df1, indexed_df2, left_on='key', right_index=True)
Out[19]:
key | value_x | value_y | |
---|---|---|---|
1 | B | -0.593100 | -0.080075 |
3 | D | 0.841464 | 1.130292 |
3 | D | 0.841464 | 1.458245 |
SQL Query:
SELECT * FROM df1 LEFT OUTER JOIN df2 ON df1.key = df2.key;
In [20]:
pd.merge(df1, df2, on='key', how='left')
Out[20]:
key | value_x | value_y | |
---|---|---|---|
0 | A | -1.253014 | NaN |
1 | B | -0.593100 | -0.080075 |
2 | C | 0.718313 | NaN |
3 | D | 0.841464 | 1.130292 |
4 | D | 0.841464 | 1.458245 |
In [21]:
pd.merge(df1, df2, on='key', how='right')
Out[21]:
key | value_x | value_y | |
---|---|---|---|
0 | B | -0.593100 | -0.080075 |
1 | D | 0.841464 | 1.130292 |
2 | D | 0.841464 | 1.458245 |
3 | E | NaN | 1.109947 |
In [22]:
pd.merge(df1, df2, on='key', how='outer')
Out[22]:
key | value_x | value_y | |
---|---|---|---|
0 | A | -1.253014 | NaN |
1 | B | -0.593100 | -0.080075 |
2 | C | 0.718313 | NaN |
3 | D | 0.841464 | 1.130292 |
4 | D | 0.841464 | 1.458245 |
5 | E | NaN | 1.109947 |
In [23]:
df1 = pd.DataFrame({'city_name': ['Chicago', 'San Francisco', 'New York City'],
'rank': range(1, 4)})
df2 = pd.DataFrame({'city_name': ['Chicago', 'Boston', 'Los Angeles'],
'rank': [1, 4, 5]})
SQL Query:
SELECT city_name, rank FROM df1 UNION ALL SELECT city_name, rank FROM df2;
In [24]:
pd.concat([df1, df2])
Out[24]:
city_name | rank | |
---|---|---|
0 | Chicago | 1 |
1 | San Francisco | 2 |
2 | New York City | 3 |
0 | Chicago | 1 |
1 | Boston | 4 |
2 | Los Angeles | 5 |
SQL Query:
SELECT city_name, rank FROM df1 UNION SELECT city_name, rank FROM df2;
In [25]:
pd.concat([df1, df2]).drop_duplicates()
Out[25]:
city_name | rank | |
---|---|---|
0 | Chicago | 1 |
1 | San Francisco | 2 |
2 | New York City | 3 |
1 | Boston | 4 |
2 | Los Angeles | 5 |
SQL Query:
SELECT * FROM tips ORDER BY tip DESC LIMIT 10 OFFSET 5;
In [26]:
tips.nlargest(10+5, columns='tip').tail(10)
Out[26]:
totalBill | tip | sex | is_smoker | tip_day | tip_time | size | |
---|---|---|---|---|---|---|---|
183 | 23.17 | 6.50 | Male | Yes | Sun | Dinner Time | 4 |
214 | 28.17 | 6.50 | Female | Yes | Saturday | Dinner Time | 3 |
47 | 32.40 | 6.00 | Male | No | Sun | Dinner Time | 4 |
239 | 29.03 | 5.92 | Male | No | Saturday | Dinner Time | 3 |
88 | 24.71 | 5.85 | Male | No | Thur | Lunch Time | 2 |
181 | 23.33 | 5.65 | Male | Yes | Sun | Dinner Time | 2 |
44 | 30.40 | 5.60 | Male | No | Sun | Dinner Time | 4 |
52 | 34.81 | 5.20 | Female | No | Sun | Dinner Time | 4 |
85 | 34.83 | 5.17 | Female | No | Thur | Lunch Time | 4 |
211 | 25.89 | 5.16 | Male | Yes | Saturday | Dinner Time | 4 |
SQL Query:
SELECT * FROM ( SELECT t.*, ROW_NUMBER() OVER(PARTITION BY tip_day ORDER BY totalBill DESC) AS rn FROM tips t ) WHERE rn < 3 ORDER BY tip_day, rn;
In [27]:
(tips.assign(rn=tips.sort_values(['totalBill'], ascending=False)
.groupby(['tip_day'])
.cumcount() + 1)
.query('rn < 3')
.sort_values(['tip_day','rn'])
)
Out[27]:
totalBill | tip | sex | is_smoker | tip_day | tip_time | size | rn | |
---|---|---|---|---|---|---|---|---|
95 | 40.17 | 4.73 | Male | Yes | Friday | Dinner Time | 4 | 1 |
90 | 28.97 | 3.00 | Male | Yes | Friday | Dinner Time | 2 | 2 |
170 | 50.81 | 10.00 | Male | Yes | Saturday | Dinner Time | 3 | 1 |
212 | 48.33 | 9.00 | Male | No | Saturday | Dinner Time | 4 | 2 |
156 | 48.17 | 5.00 | Male | No | Sun | Dinner Time | 6 | 1 |
182 | 45.35 | 3.50 | Male | Yes | Sun | Dinner Time | 3 | 2 |
197 | 43.11 | 5.00 | Female | Yes | Thur | Lunch Time | 4 | 1 |
142 | 41.19 | 5.00 | Male | No | Thur | Lunch Time | 5 | 2 |
In [28]:
(tips.assign(rnk=tips.groupby(['tip_day'])['totalBill']
.rank(method='first', ascending=False))
.query('rnk < 3')
.sort_values(['tip_day','rnk'])
)
Out[28]:
totalBill | tip | sex | is_smoker | tip_day | tip_time | size | rnk | |
---|---|---|---|---|---|---|---|---|
95 | 40.17 | 4.73 | Male | Yes | Friday | Dinner Time | 4 | 1.0 |
90 | 28.97 | 3.00 | Male | Yes | Friday | Dinner Time | 2 | 2.0 |
170 | 50.81 | 10.00 | Male | Yes | Saturday | Dinner Time | 3 | 1.0 |
212 | 48.33 | 9.00 | Male | No | Saturday | Dinner Time | 4 | 2.0 |
156 | 48.17 | 5.00 | Male | No | Sun | Dinner Time | 6 | 1.0 |
182 | 45.35 | 3.50 | Male | Yes | Sun | Dinner Time | 3 | 2.0 |
197 | 43.11 | 5.00 | Female | Yes | Thur | Lunch Time | 4 | 1.0 |
142 | 41.19 | 5.00 | Male | No | Thur | Lunch Time | 5 | 2.0 |
SQL Query:
SELECT * FROM ( SELECT t.*, RANK() OVER(PARTITION BY sex ORDER BY tip) AS rnk FROM tips t WHERE tip < 2 ) WHERE rnk < 3 ORDER BY sex, rnk;
In [29]:
(tips[tips['tip'] < 2]
.assign(rnk_min=tips.groupby(['sex'])['tip']
.rank(method='min'))
.query('rnk_min < 3')
.sort_values(['sex','rnk_min'])
)
Out[29]:
totalBill | tip | sex | is_smoker | day | tip_time | size | rnk_min | |
---|---|---|---|---|---|---|---|---|
67 | 3.07 | 1.00 | Female | Yes | Saturday | Dinner Time | 1 | 1.0 |
92 | 5.75 | 1.00 | Female | Yes | Friday | Dinner Time | 2 | 1.0 |
111 | 7.25 | 1.00 | Female | No | Saturday | Dinner Time | 1 | 1.0 |
236 | 12.60 | 1.00 | Male | Yes | Saturday | Dinner Time | 2 | 1.0 |
237 | 32.83 | 1.17 | Male | Yes | Saturday | Dinner Time | 2 | 2.0 |
SQL Query:
UPDATE tips SET tip = tip * 3 WHERE tip < 4;
In [30]:
tips.loc[tips['tip'] < 4, 'tip'] *=3
In [31]:
tips.shape
Out[31]:
(244, 7)
SQL Query:
DELETE FROM tips WHERE tip > 9;
In [32]:
tips = tips.loc[tips['tip'] <= 9]
tips.shape
Out[32]:
(243, 7)