MNAR Multivariate Examples
MNAR mechanism¶
This Jupyter Notebook provides various strategies to generate artificial missing data under the MNAR mechanism in a multivariate scenario. Our example uses the Breast Cancer Wisconsin dataset from Scikit-learn.
The approaches covered in this Jupyter Notebook are:
- Random
- Correlated
- Missingness Based on Own Values (MBOV) using randomness to choose the missing locations in each feature
- Missingness Based on Intra-Relation (MBIR)
In [3]:
Copied!
# Import the libraries
import pandas as pd
from sklearn.datasets import load_breast_cancer
from mdatagen.multivariate.mMNAR import mMNAR
# Load the Breast Cancer Wisconsin dataset shipped with scikit-learn
wiscosin = load_breast_cancer()

# Wrap the raw feature matrix in a DataFrame labelled with the feature names
wiscosin_df = pd.DataFrame(wiscosin["data"], columns=wiscosin["feature_names"])

# Work on a copy so the original DataFrame is left untouched
X = wiscosin_df.copy()  # Features
y = wiscosin["target"]  # Label values
# Import the libraries
import pandas as pd
from sklearn.datasets import load_breast_cancer
from mdatagen.multivariate.mMNAR import mMNAR
# Load the Breast Cancer Wisconsin dataset shipped with scikit-learn
wiscosin = load_breast_cancer()

# Wrap the raw feature matrix in a DataFrame labelled with the feature names
wiscosin_df = pd.DataFrame(wiscosin["data"], columns=wiscosin["feature_names"])

# Work on a copy so the original DataFrame is left untouched
X = wiscosin_df.copy()  # Features
y = wiscosin["target"]  # Label values
- Random
In [22]:
Copied!
# Create an instance for the MNAR mechanism.
# n_xmiss=8: 8 features will receive the missingness, chosen randomly.
# threshold=1: highest values (as noted in the original example).
generator = mMNAR(
    X=X,
    y=y,
    n_xmiss=8,
    threshold=1,
)

# Generate the missing data under the MNAR mechanism up to a 20% missing rate.
# deterministic=True: missingness based on own values.
generate_data = generator.random(missing_rate=20, deterministic=True)

# Count the missing cells per column once and reuse the result below
miss_per_col = generate_data.isna().sum()
qtd_miss = miss_per_col.sum()

# The rate is taken over every cell of the returned frame (rows * columns)
print(f"Global Missing rate = {round(qtd_miss / generate_data.size, 2)}")
print("\n")
print("Columns that receive the missingness:")
# Only columns holding at least one missing value are listed
for col in miss_per_col[miss_per_col > 0].index:
    print(col)
# Create an instance for the MNAR mechanism.
# n_xmiss=8: 8 features will receive the missingness, chosen randomly.
# threshold=1: highest values (as noted in the original example).
generator = mMNAR(
    X=X,
    y=y,
    n_xmiss=8,
    threshold=1,
)

# Generate the missing data under the MNAR mechanism up to a 20% missing rate.
# deterministic=True: missingness based on own values.
generate_data = generator.random(missing_rate=20, deterministic=True)

# Count the missing cells per column once and reuse the result below
miss_per_col = generate_data.isna().sum()
qtd_miss = miss_per_col.sum()

# The rate is taken over every cell of the returned frame (rows * columns)
print(f"Global Missing rate = {round(qtd_miss / generate_data.size, 2)}")
print("\n")
print("Columns that receive the missingness:")
# Only columns holding at least one missing value are listed
for col in miss_per_col[miss_per_col > 0].index:
    print(col)
Global Missing rate = 0.19 Columns that receive the missingness mean area mean smoothness mean concavity mean symmetry texture error perimeter error worst texture worst area
- Correlated
In [24]:
Copied!
# Create an instance for the MNAR mechanism.
# threshold=1: highest values (as noted in the original example).
generator = mMNAR(
    X=X,
    y=y,
    threshold=1,
)

# Generate the missing data under the MNAR mechanism up to a 20% missing rate.
# deterministic=True: missingness based on own values.
generate_data = generator.correlated(missing_rate=20, deterministic=True)

# Count the missing cells per column once and reuse the result below
miss_per_col = generate_data.isna().sum()
qtd_miss = miss_per_col.sum()

# The rate is taken over every cell of the returned frame (rows * columns)
print(f"Global Missing rate = {round(qtd_miss / generate_data.size, 2)}")
print("\n")
print("Columns that receive the missingness:")
# Only columns holding at least one missing value are listed
for col in miss_per_col[miss_per_col > 0].index:
    print(col)
# Create an instance for the MNAR mechanism.
# threshold=1: highest values (as noted in the original example).
generator = mMNAR(
    X=X,
    y=y,
    threshold=1,
)

# Generate the missing data under the MNAR mechanism up to a 20% missing rate.
# deterministic=True: missingness based on own values.
generate_data = generator.correlated(missing_rate=20, deterministic=True)

# Count the missing cells per column once and reuse the result below
miss_per_col = generate_data.isna().sum()
qtd_miss = miss_per_col.sum()

# The rate is taken over every cell of the returned frame (rows * columns)
print(f"Global Missing rate = {round(qtd_miss / generate_data.size, 2)}")
print("\n")
print("Columns that receive the missingness:")
# Only columns holding at least one missing value are listed
for col in miss_per_col[miss_per_col > 0].index:
    print(col)
Global Missing rate = 0.19 Columns that receive the missingness: mean perimeter mean concavity radius error texture error compactness error fractal dimension error worst radius worst texture worst perimeter worst area worst smoothness worst concavity worst concave points worst symmetry worst fractal dimension
- Missingness Based on Own Values (MBOV) using randomness to choose the missing locations in each feature -> MBOV_randomness
In [37]:
Copied!
# Create an instance for the MNAR mechanism
generator = mMNAR(X=X, y=y)

# Features selected to receive the missingness
list_to_gen_miss = ["mean radius", "texture error", "area error"]

# Generate the missing data under the MNAR mechanism up to a 20% missing rate
generate_data = generator.MBOV_randomness(
    missing_rate=20,
    columns=list_to_gen_miss,
    randomness=0.3,
)

# Count the missing cells per column once and reuse the result below
miss_per_col = generate_data.isna().sum()
qtd_miss = miss_per_col.sum()

# NOTE: the denominator covers only the selected columns
# (number of selected columns * number of rows), not the whole frame.
cells_selected = len(list_to_gen_miss) * len(generate_data)
print(f"Global Missing rate = {round(qtd_miss / cells_selected, 2)}")
print("\n")
print("Columns that receive the missingness:")
# Only columns holding at least one missing value are listed
for col in miss_per_col[miss_per_col > 0].index:
    print(col)
# Create an instance for the MNAR mechanism
generator = mMNAR(X=X, y=y)

# Features selected to receive the missingness
list_to_gen_miss = ["mean radius", "texture error", "area error"]

# Generate the missing data under the MNAR mechanism up to a 20% missing rate
generate_data = generator.MBOV_randomness(
    missing_rate=20,
    columns=list_to_gen_miss,
    randomness=0.3,
)

# Count the missing cells per column once and reuse the result below
miss_per_col = generate_data.isna().sum()
qtd_miss = miss_per_col.sum()

# NOTE: the denominator covers only the selected columns
# (number of selected columns * number of rows), not the whole frame.
cells_selected = len(list_to_gen_miss) * len(generate_data)
print(f"Global Missing rate = {round(qtd_miss / cells_selected, 2)}")
print("\n")
print("Columns that receive the missingness:")
# Only columns holding at least one missing value are listed
for col in miss_per_col[miss_per_col > 0].index:
    print(col)
Global Missing rate = 0.19 Columns that receive the missingness: mean radius texture error area error
- Missingness Based on Intra-Relation (MBIR)
In [38]:
Copied!
# Create an instance for the MNAR mechanism
generator = mMNAR(X=X, y=y)

# Features selected to receive the missingness
list_to_gen_miss = ["mean radius", "texture error", "area error"]

# Generate the missing data under the MNAR mechanism up to a 20% missing rate
generate_data = generator.MBIR(
    missing_rate=20,
    columns=list_to_gen_miss,
    statistical_method="Mann-Whitney",
)

# Count the missing cells per column once and reuse the result below
miss_per_col = generate_data.isna().sum()
qtd_miss = miss_per_col.sum()

# NOTE: the denominator covers only the selected columns
# (number of selected columns * number of rows), not the whole frame.
cells_selected = len(list_to_gen_miss) * len(generate_data)
print(f"Global Missing rate = {round(qtd_miss / cells_selected, 2)}")
print("\n")
print("Columns that receive the missingness:")
# Only columns holding at least one missing value are listed
for col in miss_per_col[miss_per_col > 0].index:
    print(col)
# Create an instance for the MNAR mechanism
generator = mMNAR(X=X, y=y)

# Features selected to receive the missingness
list_to_gen_miss = ["mean radius", "texture error", "area error"]

# Generate the missing data under the MNAR mechanism up to a 20% missing rate
generate_data = generator.MBIR(
    missing_rate=20,
    columns=list_to_gen_miss,
    statistical_method="Mann-Whitney",
)

# Count the missing cells per column once and reuse the result below
miss_per_col = generate_data.isna().sum()
qtd_miss = miss_per_col.sum()

# NOTE: the denominator covers only the selected columns
# (number of selected columns * number of rows), not the whole frame.
cells_selected = len(list_to_gen_miss) * len(generate_data)
print(f"Global Missing rate = {round(qtd_miss / cells_selected, 2)}")
print("\n")
print("Columns that receive the missingness:")
# Only columns holding at least one missing value are listed
for col in miss_per_col[miss_per_col > 0].index:
    print(col)
Global Missing rate = 0.2 Columns that receive the missingness: mean radius texture error area error