Comparison Pandas with SQL Query - Python Pandas Module

Comparison Pandas with SQL Query - Python Pandas Module

Instructor-svgAl-Mamun Sarkar
Apr 01 , 2020

Comparison Pandas with SQL Query - Python Pandas Module. In the following codes I will compare Pandas with SQL Query.

 

Import Modules:

import numpy as np
import pandas as pd

 

In [2]:

tips = pd.read_csv('Data/tips.csv')
tips.head()

Out[2]:

  totalBill tip sex is_smoker tip_day tip_time size
0 16.99 1.01 Female No Sun Dinner Time 2
1 10.34 1.66 Male No Sun Dinner Time 3
2 21.01 3.50 Male No Sun Dinner Time 3
3 23.68 3.31 Male No Sun Dinner Time 2
4 24.59 3.61 Female No Sun Dinner Time 4

 

 

SQL Query:

SELECT totalBill, tip, is_smoker, tip_time FROM tips LIMIT 5;

In [3]:

tips[['totalBill', 'tip', 'is_smoker', 'tip_time']].head()

Out[3]:

  totalBill tip is_smoker tip_time
0 16.99 1.01 No Dinner Time
1 10.34 1.66 No Dinner Time
2 21.01 3.50 No Dinner Time
3 23.68 3.31 No Dinner Time
4 24.59 3.61 No Dinner Time

 

 

SQL Query:

SELECT * FROM tips WHERE tip_time = 'Dinner Time' LIMIT 5;

In [4]:

tips[tips['tip_time'] == 'Dinner Time'].head()

Out[4]:

  totalBill tip sex is_smoker tip_day tip_time size
0 16.99 1.01 Female No Sun Dinner Time 2
1 10.34 1.66 Male No Sun Dinner Time 3
2 21.01 3.50 Male No Sun Dinner Time 3
3 23.68 3.31 Male No Sun Dinner Time 2
4 24.59 3.61 Female No Sun Dinner Time 4

 

In [5]:

is_Dinner Time = tips['tip_time'] == 'Dinner Time'

In [6]:

is_Dinner Time.value_counts()

Out[6]:

True     176
False     68
Name: tip_time, dtype: int64

 

 

SQL Query:

SELECT * FROM tips WHERE tip_time = 'Dinner Time' AND tip > 5.00;

In [7]:

tips[(tips['tip_time'] == 'Dinner Time') & (tips['tip'] > 5.00 )].head(10)

Out[7]:

  totalBill tip sex is_smoker tip_day tip_time size
23 39.42 7.58 Male No Saturday Dinner Time 4
44 30.40 5.60 Male No Sun Dinner Time 4
47 32.40 6.00 Male No Sun Dinner Time 4
52 34.81 5.20 Female No Sun Dinner Time 4
59 48.27 6.73 Male No Saturday Dinner Time 4
116 29.93 5.07 Male No Sun Dinner Time 4
155 29.85 5.14 Female No Sun Dinner Time 5
170 50.81 10.00 Male Yes Saturday Dinner Time 3
172 7.25 5.15 Male Yes Sun Dinner Time 2
181 23.33 5.65 Male Yes Sun Dinner Time 2

 

 

SQL Query:

SELECT * FROM tips WHERE size >= 5 OR totalBill > 45;

In [8]:

tips[(tips['size'] >= 5) | (tips['totalBill'] > 45)]

Out[8]:

  totalBill tip sex is_smoker tip_day tip_time size
59 48.27 6.73 Male No Saturday Dinner Time 4
125 29.80 4.20 Female No Thur Lunch Time 6
141 34.30 6.70 Male No Thur Lunch Time 6
142 41.19 5.00 Male No Thur Lunch Time 5
143 27.05 5.00 Female No Thur Lunch Time 6
155 29.85 5.14 Female No Sun Dinner Time 5
156 48.17 5.00 Male No Sun Dinner Time 6
170 50.81 10.00 Male Yes Saturday Dinner Time 3
182 45.35 3.50 Male Yes Sun Dinner Time 3
185 20.69 5.00 Male No Sun Dinner Time 5
187 30.46 2.00 Male Yes Sun Dinner Time 5
212 48.33 9.00 Male No Saturday Dinner Time 4
216 28.15 3.00 Male Yes Saturday Dinner Time 5

 

In [9]:

frame = pd.DataFrame({'col1': ['A', 'B', np.NaN, 'C', 'D'],
                      'col2': ['F', np.NaN, 'G', 'H', 'I']})
frame

Out[9]:

  col1 col2
0 A F
1 B NaN
2 NaN G
3 C H
4 D I

 

 

SQL Query:

SELECT * FROM frame WHERE col2 IS NULL;

In [10]:

frame[frame['col2'].isnull()]

Out[10]:

  col1 col2
1 B NaN

 

 

SQL Query:

SELECT * FROM frame WHERE col1 IS NOT NULL;

In [11]:

frame[frame['col1'].notnull()]

Out[11]:

  col1 col2
0 A F
1 B NaN
3 C H
4 D I

 

 

SQL Query:

SELECT sex, count(*) FROM tips GROUP BY sex;

In [12]:

tips.groupby('sex').size()

Out[12]:

sex
Female     87
Male      157
dtype: int64

 

In [13]:

tips.groupby('sex').count()

Out[13]:

  totalBill tip is_smoker tip_day tip_time size
sex            
Female 87 87 87 87 87 87
Male 157 157 157 157 157 157

 

In [14]:

tips.groupby('sex')['totalBill', 'tip'].count()

Out[14]:

  totalBill tip
sex    
Female 87 87
Male 157 157

 

 

SQL Query:

SELECT tip_day, AVG(tip), COUNT(*) FROM tips GROUP BY tip_day;

In [15]:

tips.groupby('tip_day').agg({'tip':np.mean, 'tip_day': np.size})

Out[15]:

  tip tip_day
tip_day    
Friday 2.734737 19
Saturday 2.993103 87
Sun 3.255132 76
Thur 2.771452 62

 

 

SQL Query:

SELECT is_smoker, tip_day, COUNT(*), AVG(tip) FROM tips GROUP BY is_smoker, tip_day;

In [16]:

tips.groupby(['is_smoker', 'tip_day']).agg({'tip':np.mean, 'tip_day': np.size})

Out[16]:

    tip tip_day
is_smoker tip_day    
No Friday 2.812500 4
Saturday 3.102889 45
Sun 3.167895 57
Thur 2.673778 45
Yes Friday 2.714000 15
Saturday 2.875476 42
Sun 3.516842 19
Thur 3.030000 17

 

In [17]:

df1 = pd.DataFrame({'key': ['A', 'B', 'C', 'D'],
                    'value': np.random.randn(4)})
df2 = pd.DataFrame({'key': ['B', 'D', 'D', 'E'],
                    'value': np.random.randn(4)})

df1

Out[17]:

  key value
0 A -1.253014
1 B -0.593100
2 C 0.718313
3 D 0.841464

 

 

SQL Query:

SELECT * FROM df1 INNER JOIN df2 ON df1.key = df2.key;

In [18]:

pd.merge(df1, df2, on='key')

Out[18]:

  key value_x value_y
0 B -0.593100 -0.080075
1 D 0.841464 1.130292
2 D 0.841464 1.458245

 

In [19]:

indexed_df2 = df2.set_index('key')
pd.merge(df1, indexed_df2, left_on='key', right_index=True)

Out[19]:

  key value_x value_y
1 B -0.593100 -0.080075
3 D 0.841464 1.130292
3 D 0.841464 1.458245

 

 

SQL Query:

SELECT * FROM df1 LEFT OUTER JOIN df2 ON df1.key = df2.key;

In [20]:

pd.merge(df1, df2, on='key', how='left')

Out[20]:

  key value_x value_y
0 A -1.253014 NaN
1 B -0.593100 -0.080075
2 C 0.718313 NaN
3 D 0.841464 1.130292
4 D 0.841464 1.458245

 

In [21]:

pd.merge(df1, df2, on='key', how='right')

Out[21]:

  key value_x value_y
0 B -0.593100 -0.080075
1 D 0.841464 1.130292
2 D 0.841464 1.458245
3 E NaN 1.109947

 

In [22]:

pd.merge(df1, df2, on='key', how='outer')

Out[22]:

  key value_x value_y
0 A -1.253014 NaN
1 B -0.593100 -0.080075
2 C 0.718313 NaN
3 D 0.841464 1.130292
4 D 0.841464 1.458245
5 E NaN 1.109947

 

In [23]:

df1 = pd.DataFrame({'city_name': ['Chicago', 'San Francisco', 'New York City'],
                    'rank': range(1, 4)})
df2 = pd.DataFrame({'city_name': ['Chicago', 'Boston', 'Los Angeles'],
                    'rank': [1, 4, 5]})

 

 

SQL Query:

SELECT city_name, rank FROM df1 UNION ALL SELECT city_name, rank FROM df2;

In [24]:

pd.concat([df1, df2])

Out[24]:

  city_name rank
0 Chicago 1
1 San Francisco 2
2 New York City 3
0 Chicago 1
1 Boston 4
2 Los Angeles 5

 

 

SQL Query:

SELECT city_name, rank FROM df1 UNION SELECT city_name, rank FROM df2;

In [25]:

pd.concat([df1, df2]).drop_duplicates()

Out[25]:

  city_name rank
0 Chicago 1
1 San Francisco 2
2 New York City 3
1 Boston 4
2 Los Angeles 5

 

 

SQL Query:

SELECT * FROM tips ORDER BY tip DESC LIMIT 10 OFFSET 5;

In [26]:

tips.nlargest(10+5, columns='tip').tail(10)

Out[26]:

  totalBill tip sex is_smoker tip_day tip_time size
183 23.17 6.50 Male Yes Sun Dinner Time 4
214 28.17 6.50 Female Yes Saturday Dinner Time 3
47 32.40 6.00 Male No Sun Dinner Time 4
239 29.03 5.92 Male No Saturday Dinner Time 3
88 24.71 5.85 Male No Thur Lunch Time 2
181 23.33 5.65 Male Yes Sun Dinner Time 2
44 30.40 5.60 Male No Sun Dinner Time 4
52 34.81 5.20 Female No Sun Dinner Time 4
85 34.83 5.17 Female No Thur Lunch Time 4
211 25.89 5.16 Male Yes Saturday Dinner Time 4

 

 

SQL Query:

SELECT * FROM ( SELECT t.*, ROW_NUMBER() OVER(PARTITION BY tip_day ORDER BY totalBill DESC) AS rn FROM tips t ) WHERE rn < 3 ORDER BY tip_day, rn;

In [27]:

(tips.assign(rn=tips.sort_values(['totalBill'], ascending=False)
                     .groupby(['tip_day'])
                     .cumcount() + 1)
      .query('rn < 3')
      .sort_values(['tip_day','rn'])
)

Out[27]:

  totalBill tip sex is_smoker tip_day tip_time size rn
95 40.17 4.73 Male Yes Friday Dinner Time 4 1
90 28.97 3.00 Male Yes Friday Dinner Time 2 2
170 50.81 10.00 Male Yes Saturday Dinner Time 3 1
212 48.33 9.00 Male No Saturday Dinner Time 4 2
156 48.17 5.00 Male No Sun Dinner Time 6 1
182 45.35 3.50 Male Yes Sun Dinner Time 3 2
197 43.11 5.00 Female Yes Thur Lunch Time 4 1
142 41.19 5.00 Male No Thur Lunch Time 5 2

 

In [28]:

(tips.assign(rnk=tips.groupby(['tip_day'])['totalBill']
                     .rank(method='first', ascending=False))
     .query('rnk < 3')
     .sort_values(['tip_day','rnk'])
)

Out[28]:

  totalBill tip sex is_smoker tip_day tip_time size rnk
95 40.17 4.73 Male Yes Friday Dinner Time 4 1.0
90 28.97 3.00 Male Yes Friday Dinner Time 2 2.0
170 50.81 10.00 Male Yes Saturday Dinner Time 3 1.0
212 48.33 9.00 Male No Saturday Dinner Time 4 2.0
156 48.17 5.00 Male No Sun Dinner Time 6 1.0
182 45.35 3.50 Male Yes Sun Dinner Time 3 2.0
197 43.11 5.00 Female Yes Thur Lunch Time 4 1.0
142 41.19 5.00 Male No Thur Lunch Time 5 2.0

 

 

SQL Query:

SELECT * FROM ( SELECT t.*, RANK() OVER(PARTITION BY sex ORDER BY tip) AS rnk FROM tips t WHERE tip < 2 ) WHERE rnk < 3 ORDER BY sex, rnk;

In [29]:

 (tips[tips['tip'] < 2]
     .assign(rnk_min=tips.groupby(['sex'])['tip']
                         .rank(method='min'))
     .query('rnk_min < 3')
     .sort_values(['sex','rnk_min'])
)

Out[29]:

  totalBill tip sex is_smoker day tip_time size rnk_min
67 3.07 1.00 Female Yes Saturday Dinner Time 1 1.0
92 5.75 1.00 Female Yes Friday Dinner Time 2 1.0
111 7.25 1.00 Female No Saturday Dinner Time 1 1.0
236 12.60 1.00 Male Yes Saturday Dinner Time 2 1.0
237 32.83 1.17 Male Yes Saturday Dinner Time 2 2.0

 

 

SQL Query:

UPDATE tips SET tip = tip * 3 WHERE tip < 4;

In [30]:

tips.loc[tips['tip'] < 4, 'tip'] *=3

In [31]:

tips.shape

Out[31]:

(244, 7)

 

 

SQL Query:

DELETE FROM tips WHERE tip > 9;

In [32]:

tips = tips.loc[tips['tip'] <= 9]
tips.shape

Out[32]:

(243, 7)
  • Share On:
  • fb
  • twitter
  • pinterest
  • instagram