본문 바로가기

PROGRAM/Python

iris.csv 분석

 

 

 

 

In [65]:
%matplotlib inline
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [17]:
# Load the iris measurements from the local CSV file into `df`, then print
# the column names, non-null counts and dtypes.
df = pd.read_csv("iris.csv")
df.info()
 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
In [18]:
# Preview the first rows of the CSV-loaded frame.
df.head()
Out[18]:
  sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
In [104]:
# Apply seaborn's "ticks" style, load seaborn's own "iris" dataset into `iris`
# (a separate object from the CSV-backed `df`), and draw a pair plot whose
# points are coloured by the `species` column using the "husl" palette.
sns.set(style="ticks", color_codes=True)
iris = sns.load_dataset("iris")
g = sns.pairplot(iris, hue="species", palette="husl")
 
In [20]:
# Column summary of the seaborn-loaded frame, for comparison with `df`.
iris.info()
 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
In [21]:
# Same summary for the CSV-loaded frame; the printed schema matches `iris`.
df.info()
 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
In [22]:
# Distinct class labels in the seaborn-loaded frame.
iris['species'].unique()
Out[22]:
array(['setosa', 'versicolor', 'virginica'], dtype=object)
In [23]:
# Distinct class labels in the CSV-loaded frame.
df['species'].unique()
Out[23]:
array(['setosa', 'versicolor', 'virginica'], dtype=object)
In [50]:
# Summary statistics for every column; include='all' adds the non-numeric
# `species` column (count / unique / top / freq) next to the numeric stats.
df.describe(include='all')
Out[50]:
  sepal_length sepal_width petal_length petal_width species
count 150.000000 150.000000 150.000000 150.000000 150
unique NaN NaN NaN NaN 3
top NaN NaN NaN NaN virginica
freq NaN NaN NaN NaN 50
mean 5.843333 3.054000 3.758667 1.198667 NaN
std 0.828066 0.433594 1.764420 0.763161 NaN
min 4.300000 2.000000 1.000000 0.100000 NaN
25% 5.100000 2.800000 1.600000 0.300000 NaN
50% 5.800000 3.000000 4.350000 1.300000 NaN
75% 6.400000 3.300000 5.100000 1.800000 NaN
max 7.900000 4.400000 6.900000 2.500000 NaN
In [51]:
from pandas.api.types import is_numeric_dtype

# Report mean, standard deviation, minimum and maximum for each numeric
# column; non-numeric columns (e.g. the species label) are skipped.
numeric_columns = [name for name in df.columns if is_numeric_dtype(df[name])]
for name in numeric_columns:
    values = df[name]
    print('%s:' % name)
    print('\t Mean = %.2f' % values.mean())
    print('\t Standard deviation = %.2f' % values.std())
    print('\t Minimum = %.2f' % values.min())
    print('\t Maximum = %.2f' % values.max())
 
sepal_length:
	 Mean = 5.84
	 Standard deviation = 0.83
	 Minimum = 4.30
	 Maximum = 7.90
sepal_width:
	 Mean = 3.05
	 Standard deviation = 0.43
	 Minimum = 2.00
	 Maximum = 4.40
petal_length:
	 Mean = 3.76
	 Standard deviation = 1.76
	 Minimum = 1.00
	 Maximum = 6.90
petal_width:
	 Mean = 1.20
	 Standard deviation = 0.76
	 Minimum = 0.10
	 Maximum = 2.50
In [53]:
# Number of rows per species label.
df['species'].value_counts()
Out[53]:
virginica     50
setosa        50
versicolor    50
Name: species, dtype: int64
In [54]:
# Pairwise covariance of the measurement columns.
# Covariance is only defined for numeric data. Since pandas 2.0, DataFrame.cov()
# no longer drops non-numeric columns silently and raises on the string-valued
# `species` column, so the numeric columns are selected explicitly. This form
# also works on older pandas releases and yields the same 4x4 table.
print('Covariance:')
df.select_dtypes(include='number').cov()
 
Covariance:
Out[54]:
  sepal_length sepal_width petal_length petal_width
sepal_length 0.685694 -0.039268 1.273682 0.516904
sepal_width -0.039268 0.188004 -0.321713 -0.117981
petal_length 1.273682 -0.321713 3.113179 1.296387
petal_width 0.516904 -0.117981 1.296387 0.582414
In [55]:
# Pairwise (Pearson, the default) correlation of the measurement columns.
# Since pandas 2.0, DataFrame.corr() no longer drops non-numeric columns
# silently and raises on the string-valued `species` column, so the numeric
# columns are selected explicitly. This form also works on older pandas
# releases and yields the same 4x4 table.
print('Correlation:')
df.select_dtypes(include='number').corr()
 
Correlation:
Out[55]:
  sepal_length sepal_width petal_length petal_width
sepal_length 1.000000 -0.109369 0.871754 0.817954
sepal_width -0.109369 1.000000 -0.420516 -0.356544
petal_length 0.871754 -0.420516 1.000000 0.962757
petal_width 0.817954 -0.356544 0.962757 1.000000
 

데이터 시각화

In [74]:
%matplotlib inline

# Histogram of the sepal_length column using 8 bins.
df['sepal_length'].hist(bins=8)
Out[74]:
<matplotlib.axes._subplots.AxesSubplot at 0x23469dc5208>
 
In [88]:
# Draw one histogram per measurement column on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(10,6))
bins_number=8
feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
# axes.flat walks the grid row by row, so the panels keep the original order.
for ax, column in zip(axes.flat, feature_columns):
    ax.hist(df[column], bins=bins_number)
    # The measured quantity lies along the x-axis of a histogram; the y-axis
    # is the number of samples falling into each bin.
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
# Keep the top row's x-labels from colliding with the bottom row's panels.
fig.tight_layout()
 
In [89]:
# Box-and-whisker plot drawn from the CSV-loaded frame.
df.boxplot()
Out[89]:
<matplotlib.axes._subplots.AxesSubplot at 0x2346fa16b88>
 
In [95]:
import matplotlib.pyplot as plt

# Scatter plot for every unordered pair of the first four columns
# (6 pairs), laid out row by row on a 3x2 grid.
fig, axes = plt.subplots(3, 2, figsize=(10,12))
column_pairs = [(first, second) for first in range(3) for second in range(first + 1, 4)]
for panel, (x_pos, y_pos) in enumerate(column_pairs):
    # Panel number -> (row, column) position in the two-column grid.
    row, col = divmod(panel, 2)
    x_name = df.columns[x_pos]
    y_name = df.columns[y_pos]
    target = axes[row][col]
    target.scatter(df[x_name], df[y_name], color='blue')
    target.set_xlabel(x_name)
    target.set_ylabel(y_name)
 
In [98]:
# Parallel-coordinates plot of `df`, with 'species' passed as the class column.
from pandas.plotting import parallel_coordinates
parallel_coordinates(df, 'species')
Out[98]:
<matplotlib.axes._subplots.AxesSubplot at 0x2346a19f3c8>
 
In [101]:
# Scatter-plot matrix of `df` on an 8x5 figure; plt.show() renders it.
from pandas.plotting import scatter_matrix
scatter_matrix(df, figsize=(8,5))
plt.show()
 
 

참조

블로그 화면 크기에 맞도록 Jupyter notebook의 출력 폭을 조정하는 코드를 아래에 삽입했습니다.

In [106]:
# Stretch the notebook's `.container` element to the full page width by
# injecting a CSS rule into the rendered output.
# `display` and `HTML` are imported from the public `IPython.display` module:
# importing `display` from `IPython.core.display` has been deprecated since
# IPython 7.14 and fails on recent IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container {width:100% !important; }</style>"))
 
 

'PROGRAM > Python' 카테고리의 다른 글

랜덤뽑기  (0) 2022.09.01
Class / lambda식 등  (0) 2020.09.07
jupyter notebook을 tistory에 적용하기  (0) 2020.07.30
class -sample  (0) 2020.04.29
Turtle - Example  (0) 2020.04.27