Visualization Plots
This Jupyter notebook provides some examples of how to use the Visualization module from mdatagen. The first step is to get the Iris dataset from Scikit-Learn. Afterward, we generate missing values as follows:
In [31]:
Copied!
# Load the iris dataset and insert missing values (MV) into one or two columns
import pandas as pd
from sklearn.datasets import load_iris
import numpy as np
from typing import Optional
from mdatagen.plots import PlotMissingData
def create_iris_missing(mrate, col_1: str, col_2: Optional[str] = None,
                        seed: Optional[int] = None):
    """Load the iris features and set a fraction of one or two columns to NaN.

    Only the feature matrix is used; the class labels are not included.

    Parameters
    ----------
    mrate : float
        Fraction of rows to set to NaN in each selected column. Must lie
        in the interval [0, 1].
    col_1 : str
        Name of the first feature column that receives missing values.
    col_2 : str, optional
        Name of a second feature column that receives missing values. Its
        rows are drawn independently of the rows drawn for ``col_1``.
    seed : int, optional
        Seed for a private random generator, making the draw reproducible.
        When ``None`` (default) the global ``np.random`` state is used, as
        before.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The complete feature frame and a copy of it with NaNs inserted.

    Raises
    ------
    ValueError
        If ``mrate`` is outside [0, 1] or a requested column is not one of
        the iris feature names.
    """
    if not 0 <= mrate <= 1:
        raise ValueError(f"mrate must be between 0 and 1, got {mrate!r}")

    iris = load_iris()
    iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

    # A falsy col_2 (None or "") means "only one column", as in the original.
    target_cols = [col_1] + ([col_2] if col_2 else [])

    # Assigning through .loc with an unknown column label would silently add a
    # new all-NaN column instead of touching an existing feature, so fail loudly.
    unknown = [col for col in target_cols if col not in iris_df.columns]
    if unknown:
        raise ValueError(
            f"Unknown column(s) {unknown}; expected one of {list(iris_df.columns)}"
        )

    # Use the global NumPy state unless the caller asked for a seeded draw.
    rng = np.random if seed is None else np.random.RandomState(seed)

    iris_miss = iris_df.copy()
    num_missing = int(mrate * len(iris_miss))
    for col in target_cols:
        missing_indices = rng.choice(iris_miss.index, size=num_missing, replace=False)
        iris_miss.loc[missing_indices, col] = np.nan
    return iris_df, iris_miss
# Load the iris dataset and insert missing values (MV) into one or two columns
import pandas as pd
from sklearn.datasets import load_iris
import numpy as np
from typing import Optional
from mdatagen.plots import PlotMissingData
def create_iris_missing(mrate, col_1: str, col_2: Optional[str] = None,
                        seed: Optional[int] = None):
    """Load the iris features and set a fraction of one or two columns to NaN.

    Only the feature matrix is used; the class labels are not included.

    Parameters
    ----------
    mrate : float
        Fraction of rows to set to NaN in each selected column. Must lie
        in the interval [0, 1].
    col_1 : str
        Name of the first feature column that receives missing values.
    col_2 : str, optional
        Name of a second feature column that receives missing values. Its
        rows are drawn independently of the rows drawn for ``col_1``.
    seed : int, optional
        Seed for a private random generator, making the draw reproducible.
        When ``None`` (default) the global ``np.random`` state is used, as
        before.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The complete feature frame and a copy of it with NaNs inserted.

    Raises
    ------
    ValueError
        If ``mrate`` is outside [0, 1] or a requested column is not one of
        the iris feature names.
    """
    if not 0 <= mrate <= 1:
        raise ValueError(f"mrate must be between 0 and 1, got {mrate!r}")

    iris = load_iris()
    iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

    # A falsy col_2 (None or "") means "only one column", as in the original.
    target_cols = [col_1] + ([col_2] if col_2 else [])

    # Assigning through .loc with an unknown column label would silently add a
    # new all-NaN column instead of touching an existing feature, so fail loudly.
    unknown = [col for col in target_cols if col not in iris_df.columns]
    if unknown:
        raise ValueError(
            f"Unknown column(s) {unknown}; expected one of {list(iris_df.columns)}"
        )

    # Use the global NumPy state unless the caller asked for a seeded draw.
    rng = np.random if seed is None else np.random.RandomState(seed)

    iris_miss = iris_df.copy()
    num_missing = int(mrate * len(iris_miss))
    for col in target_cols:
        missing_indices = rng.choice(iris_miss.index, size=num_missing, replace=False)
        iris_miss.loc[missing_indices, col] = np.nan
    return iris_df, iris_miss
Only 1 feature with missing values¶
In [32]:
Copied!
# Missing rate passed to create_iris_missing for each selected column.
mr = 0.4
# col_1 receives the missing values in this section; col_2 is the second
# feature referenced by later cells.
col_1, col_2 = "petal length (cm)", "sepal width (cm)"
cols = col_1, col_2
# Only col_1 is passed here, so a single feature gets missing values.
iris_df, iris_miss = create_iris_missing(mr, col_1=col_1)
# Missing rate passed to create_iris_missing for each selected column.
mr = 0.4
# col_1 receives the missing values in this section; col_2 is the second
# feature referenced by later cells.
col_1, col_2 = "petal length (cm)", "sepal width (cm)"
cols = col_1, col_2
# Only col_1 is passed here, so a single feature gets missing values.
iris_df, iris_miss = create_iris_missing(mr, col_1=col_1)
In [4]:
Copied!
iris_df
iris_df
Out[4]:
sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | |
---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 |
1 | 4.9 | 3.0 | 1.4 | 0.2 |
2 | 4.7 | 3.2 | 1.3 | 0.2 |
3 | 4.6 | 3.1 | 1.5 | 0.2 |
4 | 5.0 | 3.6 | 1.4 | 0.2 |
... | ... | ... | ... | ... |
145 | 6.7 | 3.0 | 5.2 | 2.3 |
146 | 6.3 | 2.5 | 5.0 | 1.9 |
147 | 6.5 | 3.0 | 5.2 | 2.0 |
148 | 6.2 | 3.4 | 5.4 | 2.3 |
149 | 5.9 | 3.0 | 5.1 | 1.8 |
150 rows × 4 columns
In [5]:
Copied!
iris_miss
iris_miss
Out[5]:
sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | |
---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 |
1 | 4.9 | 3.0 | NaN | 0.2 |
2 | 4.7 | 3.2 | 1.3 | 0.2 |
3 | 4.6 | 3.1 | 1.5 | 0.2 |
4 | 5.0 | 3.6 | NaN | 0.2 |
... | ... | ... | ... | ... |
145 | 6.7 | 3.0 | NaN | 2.3 |
146 | 6.3 | 2.5 | 5.0 | 1.9 |
147 | 6.5 | 3.0 | NaN | 2.0 |
148 | 6.2 | 3.4 | 5.4 | 2.3 |
149 | 5.9 | 3.0 | 5.1 | 1.8 |
150 rows × 4 columns
In [ ]:
Copied!
miss_plot = PlotMissingData(data_missing=iris_miss, data_original=iris_df)
miss_plot = PlotMissingData(data_missing=iris_miss, data_original=iris_df)
In [7]:
Copied!
miss_plot.visualize_miss("normal")
miss_plot.visualize_miss("normal")
In [8]:
Copied!
miss_plot.visualize_miss("bar")
miss_plot.visualize_miss("bar")
In [9]:
Copied!
miss_plot.visualize_miss("dendrogram")
miss_plot.visualize_miss("dendrogram")
In [10]:
Copied!
miss_plot.visualize_miss("heatmap")
miss_plot.visualize_miss("heatmap")
In [11]:
Copied!
miss_plot.visualize_miss(visualization_type="histogram", col_missing=col_1, num_bins=5)
miss_plot.visualize_miss(visualization_type="histogram", col_missing=col_1, num_bins=5)
In [12]:
Copied!
miss_plot.visualize_miss(visualization_type="boxplot", col_missing=col_1)
miss_plot.visualize_miss(visualization_type="boxplot", col_missing=col_1)
In [13]:
Copied!
miss_plot.visualize_miss(visualization_type="scatterplot", cols=(col_1, col_2))
miss_plot.visualize_miss(visualization_type="scatterplot", cols=(col_1, col_2))
We can also swap the plot axes by reversing the column order, so that the feature with missing values is drawn on the other axis:
In [14]:
Copied!
miss_plot.visualize_miss(visualization_type="scatterplot", cols=(col_2, col_1))
miss_plot.visualize_miss(visualization_type="scatterplot", cols=(col_2, col_1))
Two features with missing values¶
In [33]:
Copied!
_, iris_miss_2d = create_iris_missing(mr, col_1, col_2)
_, iris_miss_2d = create_iris_missing(mr, col_1, col_2)
In [34]:
Copied!
iris_miss_2d
iris_miss_2d
Out[34]:
sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | |
---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 |
1 | 4.9 | NaN | 1.4 | 0.2 |
2 | 4.7 | 3.2 | 1.3 | 0.2 |
3 | 4.6 | NaN | 1.5 | 0.2 |
4 | 5.0 | NaN | 1.4 | 0.2 |
... | ... | ... | ... | ... |
145 | 6.7 | NaN | 5.2 | 2.3 |
146 | 6.3 | 2.5 | NaN | 1.9 |
147 | 6.5 | 3.0 | 5.2 | 2.0 |
148 | 6.2 | 3.4 | NaN | 2.3 |
149 | 5.9 | NaN | 5.1 | 1.8 |
150 rows × 4 columns
In [35]:
Copied!
miss_plot_2d = PlotMissingData(data_missing=iris_miss_2d, data_original=iris_df)
miss_plot_2d = PlotMissingData(data_missing=iris_miss_2d, data_original=iris_df)
In [36]:
Copied!
miss_plot_2d.visualize_miss("normal")
miss_plot_2d.visualize_miss("normal")
In [37]:
Copied!
miss_plot_2d.visualize_miss("bar")
miss_plot_2d.visualize_miss("bar")
In [38]:
Copied!
miss_plot_2d.visualize_miss("dendrogram")
miss_plot_2d.visualize_miss("dendrogram")
In [39]:
Copied!
miss_plot_2d.visualize_miss("heatmap")
miss_plot_2d.visualize_miss("heatmap")
In [40]:
Copied!
miss_plot_2d.visualize_miss(visualization_type="histogram", col_missing=col_1, num_bins=5)
miss_plot_2d.visualize_miss(visualization_type="histogram", col_missing=col_1, num_bins=5)
In [41]:
Copied!
miss_plot_2d.visualize_miss(visualization_type="histogram", col_missing=col_2, num_bins=5)
miss_plot_2d.visualize_miss(visualization_type="histogram", col_missing=col_2, num_bins=5)
In [42]:
Copied!
miss_plot_2d.visualize_miss(visualization_type="boxplot", col_missing=col_1)
miss_plot_2d.visualize_miss(visualization_type="boxplot", col_missing=col_1)
In [43]:
Copied!
miss_plot_2d.visualize_miss(visualization_type="boxplot", col_missing=col_2)
miss_plot_2d.visualize_miss(visualization_type="boxplot", col_missing=col_2)
In [44]:
Copied!
miss_plot_2d.visualize_miss(visualization_type="scatterplot", cols=(col_1, col_2))
miss_plot_2d.visualize_miss(visualization_type="scatterplot", cols=(col_1, col_2))
In [201]:
Copied!
iris_miss
iris_miss
Out[201]:
sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | |
---|---|---|---|---|
0 | 5.1 | 3.5 | NaN | 0.2 |
1 | 4.9 | 3.0 | 1.4 | 0.2 |
2 | 4.7 | 3.2 | NaN | 0.2 |
3 | 4.6 | 3.1 | 1.5 | 0.2 |
4 | 5.0 | 3.6 | 1.4 | 0.2 |
... | ... | ... | ... | ... |
145 | 6.7 | 3.0 | NaN | 2.3 |
146 | 6.3 | 2.5 | 5.0 | 1.9 |
147 | 6.5 | 3.0 | 5.2 | 2.0 |
148 | 6.2 | 3.4 | NaN | 2.3 |
149 | 5.9 | 3.0 | NaN | 1.8 |
150 rows × 4 columns