Pandas Course notebook
In [ ]:
Copied!
!pip install pandas
!pip install pandas
In [ ]:
Copied!
import pandas as pd
import numpy as np

# A Series built from a plain list gets the default integer labels 0..n-1.
# np.nan marks a missing entry among the values.
dataset1 = [1, 3, 5, np.nan, 6, 8]
df1 = pd.Series(data=dataset1)
df1
Out[ ]:
0 1.0 1 3.0 2 5.0 3 NaN 4 6.0 5 8.0 dtype: float64
In [ ]:
Copied!
import pandas as pd

# Build a Series whose entries are addressed by explicit string labels
# instead of the default integer positions.
dataset1 = [1, 3, 5, np.nan, 6, 8]
df1 = pd.Series(data=dataset1, index=list("abcefg"))
print(df1)
a 1.0 b 3.0 c 5.0 e NaN f 6.0 g 8.0 dtype: float64
In [ ]:
Copied!
# Look up a single element of the Series by its index label.
print(df1.loc["a"])
1.0
In [ ]:
Copied!
import pandas as pd

# Building a Series from a dict: the keys become the index labels and the
# values become the data.
dataset1 = {
    "Vehicle number": 1,
    "Wheels": 4,
    "Doors": 4,
}
df1 = pd.Series(data=dataset1)
print(df1)
Vehicle number 1 Wheels 4 Doors 4 dtype: int64
In [ ]:
Copied!
# Each dict key becomes a column; each list holds that column's values,
# one entry per row.
data = {
    "name": ["Alice", "Bob", "Charlie", "David"],
    "age": [25, 32, 18, 47],
    "gender": ["F", "M", "M", "M"],
}
df = pd.DataFrame(data=data)
print(df)
name age gender 0 Alice 25 F 1 Bob 32 M 2 Charlie 18 M 3 David 47 M
In [ ]:
Copied!
import pandas as pd

# A DataFrame with named rows: the index list labels each row, the dict
# keys label each column.
dataset1 = {
    "Vehicle number": [1, 2, 3],
    "Wheels": [4, 2, 4],
    "Doors": [4, 0, 5],
}
df1 = pd.DataFrame(data=dataset1, index=["Car", "Motorcycle", "Van"])
print(df1)
Vehicle number Wheels Doors Car 1 4 4 Motorcycle 2 2 0 Van 3 4 5
In [ ]:
Copied!
# Select one row by its index label; the result is a Series keyed by column name.
print(df1.loc["Car", :])
Vehicle number 1 Wheels 4 Doors 4 Name: Car, dtype: int64
In [ ]:
Copied!
import pandas as pd

# Importing a CSV file: read_csv parses the file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="data.csv")
df
Out[ ]:
Duration | Pulse | Maxpulse | Calories | |
---|---|---|---|---|
0 | 60 | 110 | 130 | 409.1 |
1 | 60 | 117 | 145 | 479.0 |
2 | 60 | 103 | 135 | 340.0 |
3 | 45 | 109 | 175 | 282.4 |
4 | 45 | 117 | 148 | 406.0 |
... | ... | ... | ... | ... |
164 | 60 | 105 | 140 | 290.8 |
165 | 60 | 110 | 145 | 300.0 |
166 | 60 | 115 | 145 | 310.2 |
167 | 75 | 120 | 150 | 320.4 |
168 | 75 | 125 | 150 | 330.4 |
169 rows × 4 columns
In [ ]:
Copied!
# Export a CSV file.
# index=False tells pandas not to write the row index to the CSV file.
# NOTE: this writes to 'data.csv', the same path the frame was read from.
df.to_csv(path_or_buf="data.csv", index=False)
In [ ]:
Copied!
# Preview the first rows of the frame (5 is the default row count).
df.head(n=5)
Out[ ]:
Duration | Pulse | Maxpulse | Calories | |
---|---|---|---|---|
0 | 60 | 110 | 130 | 409.1 |
1 | 60 | 117 | 145 | 479.0 |
2 | 60 | 103 | 135 | 340.0 |
3 | 45 | 109 | 175 | 282.4 |
4 | 45 | 117 | 148 | 406.0 |
In [ ]:
Copied!
# Preview the last rows of the frame (5 is the default row count).
df.tail(n=5)
Out[ ]:
Duration | Pulse | Maxpulse | Calories | |
---|---|---|---|---|
164 | 60 | 105 | 140 | 290.8 |
165 | 60 | 110 | 145 | 300.0 |
166 | 60 | 115 | 145 | 310.2 |
167 | 75 | 120 | 150 | 320.4 |
168 | 75 | 125 | 150 | 330.4 |
In [ ]:
Copied!
# Print a summary of the frame: index range, columns, non-null counts and dtypes.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 169 entries, 0 to 168 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Duration 169 non-null int64 1 Pulse 169 non-null int64 2 Maxpulse 169 non-null int64 3 Calories 164 non-null float64 dtypes: float64(1), int64(3) memory usage: 5.4 KB
In [ ]:
Copied!
# Mark every cell as missing (True) or present (False), then count how many
# rows share each True/False pattern.
df.isna().value_counts()
Out[ ]:
Duration Pulse Maxpulse Calories False False False False 164 True 5 dtype: int64
In [ ]:
Copied!
# Remove every row that contains at least one missing value.
# inplace=True modifies df itself instead of returning a new frame.
df.dropna(axis=0, how="any", inplace=True)
df
Out[ ]:
Duration | Pulse | Maxpulse | Calories | |
---|---|---|---|---|
0 | 60 | 110 | 130 | 409.1 |
1 | 60 | 117 | 145 | 479.0 |
2 | 60 | 103 | 135 | 340.0 |
3 | 45 | 109 | 175 | 282.4 |
4 | 45 | 117 | 148 | 406.0 |
... | ... | ... | ... | ... |
164 | 60 | 105 | 140 | 290.8 |
165 | 60 | 110 | 145 | 300.0 |
166 | 60 | 115 | 145 | 310.2 |
167 | 75 | 120 | 150 | 320.4 |
168 | 75 | 125 | 150 | 330.4 |
164 rows × 4 columns
In [ ]:
Copied!
# Re-count the missing-value patterns to check the effect of the previous step.
df.isna().value_counts()
Out[ ]:
Duration Pulse Maxpulse Calories False False False False 164 dtype: int64
In [ ]:
Copied!
import pandas as pd

# Reload the file, then replace every missing value in the frame with 130.
df = pd.read_csv("data.csv")

# The frame-wide fill already covers the Calories column, so no second,
# column-level fill is needed. A column-level in-place fill such as
# df["Calories"].fillna(130, inplace=True) is a chained assignment, which
# recent pandas versions warn about and may not apply to df at all.
# To fill only one column, assign the result back instead:
#   df["Calories"] = df["Calories"].fillna(130)
df = df.fillna(130)
df
Out[ ]:
Duration | Pulse | Maxpulse | Calories | |
---|---|---|---|---|
0 | 60 | 110 | 130 | 409.1 |
1 | 60 | 117 | 145 | 479.0 |
2 | 60 | 103 | 135 | 340.0 |
3 | 45 | 109 | 175 | 282.4 |
4 | 45 | 117 | 148 | 406.0 |
... | ... | ... | ... | ... |
164 | 60 | 105 | 140 | 290.8 |
165 | 60 | 110 | 145 | 300.0 |
166 | 60 | 115 | 145 | 310.2 |
167 | 75 | 120 | 150 | 320.4 |
168 | 75 | 125 | 150 | 330.4 |
169 rows × 4 columns
In [ ]:
Copied!
# Count the missing-value patterns again to confirm what the fill changed.
df.isna().value_counts()
Out[ ]:
Duration Pulse Maxpulse Calories False False False False 169 dtype: int64
In [ ]:
Copied!
import pandas as pd

df = pd.read_csv("data.csv")

# Candidate fill values taken from the Calories column itself.
meancal = df["Calories"].mean()
mediancal = df["Calories"].median()
# Series.mode() returns a Series (a column can have several modes), not a
# scalar. Passing that Series to fillna() would align it on the index and
# fill only the matching labels, leaving other missing rows untouched, so
# take the first mode as a scalar. Raises IndexError if the column has no
# non-missing values.
modecal = df["Calories"].mode().iloc[0]

print("The mean of calories is " + str(meancal) + " The median of calories is " + str(mediancal) + " The mode of calories is " + str(modecal))

# Each result is a new Series; df itself is left unchanged.
meandf = df["Calories"].fillna(meancal)
mediandf = df["Calories"].fillna(mediancal)
modedf = df["Calories"].fillna(modecal)
The mean of calories is375.79024390243904 The median of calories is 318.6 The mode of calories is 0 300.0 dtype: float64
In [ ]:
Copied!
# Calories column after filling missing values with the mean.
meandf
Out[ ]:
0 409.1 1 479.0 2 340.0 3 282.4 4 406.0 ... 164 290.8 165 300.0 166 310.2 167 320.4 168 330.4 Name: Calories, Length: 169, dtype: float64
In [ ]:
Copied!
# Calories column after filling missing values with the median.
mediandf
Out[ ]:
0 409.1 1 479.0 2 340.0 3 282.4 4 406.0 ... 164 290.8 165 300.0 166 310.2 167 320.4 168 330.4 Name: Calories, Length: 169, dtype: float64
In [ ]:
Copied!
# Calories column after the mode-based fill.
modedf
Out[ ]:
0 409.1 1 479.0 2 340.0 3 282.4 4 406.0 ... 164 290.8 165 300.0 166 310.2 167 320.4 168 330.4 Name: Calories, Length: 169, dtype: float64
In [ ]:
Copied!
import pandas as pd

# Two independent copies of the data, one per clean-up strategy.
df1 = pd.read_csv("data.csv")
df2 = pd.read_csv("data.csv")

# Strategy 1: replace all values in Duration above 120 with 120.
# A boolean mask selects the affected rows in one step, with no Python loop.
df1.loc[df1["Duration"] > 120, "Duration"] = 120

# Strategy 2: drop all rows whose Duration is above 120.
# The mask picks the index labels to remove; rows that do not compare
# greater than 120 are kept.
df2 = df2.drop(index=df2.index[df2["Duration"] > 120])
In [ ]:
Copied!
# Frame in which Duration values above 120 were replaced with 120.
df1
Out[ ]:
Duration | Pulse | Maxpulse | Calories | |
---|---|---|---|---|
0 | 60 | 110 | 130 | 409.1 |
1 | 60 | 117 | 145 | 479.0 |
2 | 60 | 103 | 135 | 340.0 |
3 | 45 | 109 | 175 | 282.4 |
4 | 45 | 117 | 148 | 406.0 |
... | ... | ... | ... | ... |
164 | 60 | 105 | 140 | 290.8 |
165 | 60 | 110 | 145 | 300.0 |
166 | 60 | 115 | 145 | 310.2 |
167 | 75 | 120 | 150 | 320.4 |
168 | 75 | 125 | 150 | 330.4 |
169 rows × 4 columns
In [ ]:
Copied!
# Frame from which rows with Duration above 120 were dropped.
df2
Out[ ]:
Duration | Pulse | Maxpulse | Calories | |
---|---|---|---|---|
0 | 60 | 110 | 130 | 409.1 |
1 | 60 | 117 | 145 | 479.0 |
2 | 60 | 103 | 135 | 340.0 |
3 | 45 | 109 | 175 | 282.4 |
4 | 45 | 117 | 148 | 406.0 |
... | ... | ... | ... | ... |
164 | 60 | 105 | 140 | 290.8 |
165 | 60 | 110 | 145 | 300.0 |
166 | 60 | 115 | 145 | 310.2 |
167 | 75 | 120 | 150 | 320.4 |
168 | 75 | 125 | 150 | 330.4 |
156 rows × 4 columns
In [ ]:
Copied!
import pandas as pd

df1 = pd.read_csv("data.csv")

# duplicated() marks a row True when it repeats an earlier row;
# value_counts() then tallies how many rows are True vs False.
df1.duplicated(keep="first").value_counts()
Out[ ]:
False 162 True 7 dtype: int64
In [ ]:
Copied!
# Find and drop duplicates example: remove repeated rows, keeping the first
# occurrence of each, and modify df1 in place.
df1.drop_duplicates(keep="first", inplace=True)
df1
Out[ ]:
Duration | Pulse | Maxpulse | Calories | |
---|---|---|---|---|
0 | 60 | 110 | 130 | 409.1 |
1 | 60 | 117 | 145 | 479.0 |
2 | 60 | 103 | 135 | 340.0 |
3 | 45 | 109 | 175 | 282.4 |
4 | 45 | 117 | 148 | 406.0 |
... | ... | ... | ... | ... |
164 | 60 | 105 | 140 | 290.8 |
165 | 60 | 110 | 145 | 300.0 |
166 | 60 | 115 | 145 | 310.2 |
167 | 75 | 120 | 150 | 320.4 |
168 | 75 | 125 | 150 | 330.4 |
162 rows × 4 columns
In [ ]:
Copied!
# Two small frames that share an 'id' column: one holds names, the other ages.
df1 = pd.DataFrame(
    data={
        "id": [1, 2, 3, 4],
        "name": ["Alice", "Bob", "Charlie", "David"],
    }
)
df2 = pd.DataFrame(
    data={
        "id": [1, 2, 3, 4],
        "age": [25, 32, 18, 47],
    }
)
In [ ]:
Copied!
# Frame holding the 'id' and 'name' columns.
df1
Out[ ]:
id | name | |
---|---|---|
0 | 1 | Alice |
1 | 2 | Bob |
2 | 3 | Charlie |
3 | 4 | David |
In [ ]:
Copied!
# Frame holding the 'id' and 'age' columns.
df2
Out[ ]:
id | age | |
---|---|---|
0 | 1 | 25 |
1 | 2 | 32 |
2 | 3 | 18 |
3 | 4 | 47 |
In [ ]:
Copied!
# Merge DataFrames on the 'id' column: rows with matching ids are combined
# into a single row carrying the columns of both frames.
merged_df = df1.merge(df2, on="id")
merged_df
Out[ ]:
id | name | age | |
---|---|---|---|
0 | 1 | Alice | 25 |
1 | 2 | Bob | 32 |
2 | 3 | Charlie | 18 |
3 | 4 | David | 47 |
In [ ]:
Copied!
# Rebuild the two frames that share an 'id' column, then display the first.
df1 = pd.DataFrame(
    data={
        "id": [1, 2, 3, 4],
        "name": ["Alice", "Bob", "Charlie", "David"],
    }
)
df2 = pd.DataFrame(
    data={
        "id": [1, 2, 3, 4],
        "age": [25, 32, 18, 47],
    }
)
df1
Out[ ]:
id | name | |
---|---|---|
0 | 1 | Alice |
1 | 2 | Bob |
2 | 3 | Charlie |
3 | 4 | David |
In [ ]:
Copied!
# Frame holding the 'id' and 'age' columns.
df2
Out[ ]:
id | age | |
---|---|---|
0 | 1 | 25 |
1 | 2 | 32 |
2 | 3 | 18 |
3 | 4 | 47 |
In [ ]:
Copied!
# Join DataFrames on the 'id' column. join() matches rows by index, so 'id'
# is moved into the index of both frames before joining.
joined_df = (
    df1.set_index("id")
    .join(df2.set_index("id"))
)
joined_df
Out[ ]:
name | age | |
---|---|---|
id | ||
1 | Alice | 25 |
2 | Bob | 32 |
3 | Charlie | 18 |
4 | David | 47 |
In [ ]:
Copied!
In [ ]:
Copied!
import pandas as pd

df1 = pd.read_csv(filepath_or_buffer="data.csv")

# Per-column summary statistics: count, mean, std, min, quartiles and max.
df1.describe()
Out[ ]:
Duration | Pulse | Maxpulse | Calories | |
---|---|---|---|---|
count | 169.000000 | 169.000000 | 169.000000 | 164.000000 |
mean | 63.846154 | 107.461538 | 134.047337 | 375.790244 |
std | 42.299949 | 14.510259 | 16.450434 | 266.379919 |
min | 15.000000 | 80.000000 | 100.000000 | 50.300000 |
25% | 45.000000 | 100.000000 | 124.000000 | 250.925000 |
50% | 60.000000 | 105.000000 | 131.000000 | 318.600000 |
75% | 60.000000 | 111.000000 | 141.000000 | 387.600000 |
max | 300.000000 | 159.000000 | 184.000000 | 1860.400000 |
In [ ]:
Copied!
# Pairwise correlation between the columns (Pearson is the default method).
df1.corr(method="pearson")
Out[ ]:
Duration | Pulse | Maxpulse | Calories | |
---|---|---|---|---|
Duration | 1.000000 | -0.155408 | 0.009403 | 0.922717 |
Pulse | -0.155408 | 1.000000 | 0.786535 | 0.025121 |
Maxpulse | 0.009403 | 0.786535 | 1.000000 | 0.203813 |
Calories | 0.922717 | 0.025121 | 0.203813 | 1.000000 |
In [ ]:
Copied!