Group Members:
"We, all team members, agree together that the above information is true, and we are confident about our contributions to this submitted project/final tutorial."
Mohith Nagendra 5/5/2024
Sanjana Gangishetty 5/5/2024
Eesha Kurella 5/6/2024
Members' Contributions
Mohith Nagendra - We approached each section as a team and went through it together, making sure to look over each other's work, so every team member was involved in every aspect of the project.
Sanjana Gangishetty - We approached each section as a team and went through it together, making sure to look over each other's work, so every team member was involved in every aspect of the project.
Eesha Kurella - We approached each section as a team and went through it together, making sure to look over each other's work, so every team member was involved in every aspect of the project.
The purpose of this project is to use data to explore one of society's most pressing concerns: crime. We are specifically focusing on 2024 crime data in Los Angeles. Throughout this tutorial we hope to answer the question: "What insights can be obtained from these crime statistics, and how can they inform our understanding of the societal dynamics at play in Los Angeles?" We hope to find trends among different demographics of victims in order to create predictive models that can help in cases where vital information is missing.
By explaining crime trends and identifying potential correlations with demographic variables, economic indicators, and societal factors, we aim to equip residents of Los Angeles with the knowledge necessary to stay safe and to help build a safer and more resilient community.
Overall, our project represents a convergence of data science and social impact. By using the power of data analytics, we seek to contribute to the creation of a more just and equitable society.
This is the first stage in the data life cycle. The focus here is retrieving the data we want to analyze and work on.
For our purposes we will be utilizing the Crime Data from 2020 to Present in the city of Los Angeles. We found this data set from data.gov at https://catalog.data.gov/dataset/crime-data-from-2020-to-present.
Download the csv (Comma Separated Values) file of the data from the above link so that we can access and analyze it.
For the purposes of this project we will be utilizing Google Colab to create and utilize a notebook for data analysis. So, now that we have our csv file with the data we want to first create a new Colab notebook and then click on the files button to import the csv file we downloaded. Now let's import the libraries that we will use to manipulate & analyze the data.
Imports
import pandas as pd
import scipy
from datetime import date
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from scipy.stats import f_oneway
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.metrics import roc_curve, auc, precision_recall_curve  # confusion_matrix is already imported above
import seaborn as sns
For more information on these libraries, please refer to their official documentation.
Now let's use the pandas library to read the csv file and put the data into a DataFrame, which we can then manipulate and use for our analysis:
# df is the dataframe that will hold all the crime data from the csv file
df = pd.read_csv("Crime_Data_from_2020_to_Present.csv")
# display the dataframe so we can actually see the tabular format of the data rather than the previous comma separated format
df.head()
DR_NO | Date Rptd | DATE OCC | TIME OCC | AREA | AREA NAME | Rpt Dist No | Part 1-2 | Crm Cd | Crm Cd Desc | ... | Status | Status Desc | Crm Cd 1 | Crm Cd 2 | Crm Cd 3 | Crm Cd 4 | LOCATION | Cross Street | LAT | LON | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 190326475 | 03/01/2020 12:00:00 AM | 03/01/2020 12:00:00 AM | 2130 | 7 | Wilshire | 784 | 1 | 510 | VEHICLE - STOLEN | ... | AA | Adult Arrest | 510.0 | 998.0 | NaN | NaN | 1900 S LONGWOOD AV | NaN | 34.0375 | -118.3506 |
1 | 200106753 | 02/09/2020 12:00:00 AM | 02/08/2020 12:00:00 AM | 1800 | 1 | Central | 182 | 1 | 330 | BURGLARY FROM VEHICLE | ... | IC | Invest Cont | 330.0 | 998.0 | NaN | NaN | 1000 S FLOWER ST | NaN | 34.0444 | -118.2628 |
2 | 200320258 | 11/11/2020 12:00:00 AM | 11/04/2020 12:00:00 AM | 1700 | 3 | Southwest | 356 | 1 | 480 | BIKE - STOLEN | ... | IC | Invest Cont | 480.0 | NaN | NaN | NaN | 1400 W 37TH ST | NaN | 34.0210 | -118.3002 |
3 | 200907217 | 05/10/2023 12:00:00 AM | 03/10/2020 12:00:00 AM | 2037 | 9 | Van Nuys | 964 | 1 | 343 | SHOPLIFTING-GRAND THEFT ($950.01 & OVER) | ... | IC | Invest Cont | 343.0 | NaN | NaN | NaN | 14000 RIVERSIDE DR | NaN | 34.1576 | -118.4387 |
4 | 220614831 | 08/18/2022 12:00:00 AM | 08/17/2020 12:00:00 AM | 1200 | 6 | Hollywood | 666 | 2 | 354 | THEFT OF IDENTITY | ... | IC | Invest Cont | 354.0 | NaN | NaN | NaN | 1900 TRANSIENT | NaN | 34.0944 | -118.3277 |
5 rows × 28 columns
Shape: Let's check the data frame's dimensions using df.shape to understand the number of rows (observations) and columns (features).
df.shape
(938457, 28)
df.columns
Index(['DR_NO', 'Date Rptd', 'DATE OCC', 'TIME OCC', 'AREA', 'AREA NAME', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Crm Cd Desc', 'Mocodes', 'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Premis Desc', 'Weapon Used Cd', 'Weapon Desc', 'Status', 'Status Desc', 'Crm Cd 1', 'Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4', 'LOCATION', 'Cross Street', 'LAT', 'LON'], dtype='object')
Column Data Types: In the last code block we saw what the columns are. Now let's investigate the data types of these columns using df.dtypes to identify potential issues.
df.dtypes
DR_NO                      int64
Date Rptd                 object
DATE OCC                  object
TIME OCC                   int64
AREA                       int64
AREA NAME                 object
Rpt Dist No                int64
Part 1-2                   int64
Crm Cd                     int64
Crm Cd Desc               object
Mocodes                   object
Vict Age                   int64
Vict Sex                  object
Vict Descent              object
Premis Cd                float64
Premis Desc               object
Weapon Used Cd           float64
Weapon Desc               object
Status                    object
Status Desc               object
Crm Cd 1                 float64
Crm Cd 2                 float64
Crm Cd 3                 float64
Crm Cd 4                 float64
LOCATION                  object
Cross Street              object
LAT                      float64
LON                      float64
dtype: object
Observe from the output above that 'DATE OCC' and 'Date Rptd' are not of the datetime data type, which is not ideal. So let's transform those columns to change their data type from object to datetime! Since we only want to examine the most recent data for this project, we will also clean the data and keep only the crimes which occurred in 2024.
# Let's convert the 'DATE OCC' to datetime format
df['DATE OCC'] = pd.to_datetime(df['DATE OCC'])
# Now let's filter the data and only keep the crime data for crimes which occurred in 2024
# (we take a copy so that later column assignments modify the filtered frame directly)
df = df[df['DATE OCC'].dt.year == 2024].copy()
# Let's also convert the 'Date Rptd' to datetime format
# note: we did this conversion after filtering the data so that we could save
# processing time since we don't need to convert rows for data we don't end up using
df['Date Rptd'] = pd.to_datetime(df['Date Rptd'])
df.head()
DR_NO | Date Rptd | DATE OCC | TIME OCC | AREA | AREA NAME | Rpt Dist No | Part 1-2 | Crm Cd | Crm Cd Desc | ... | Status | Status Desc | Crm Cd 1 | Crm Cd 2 | Crm Cd 3 | Crm Cd 4 | LOCATION | Cross Street | LAT | LON | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
875845 | 240604934 | 2024-01-21 | 2024-01-21 | 1510 | 6 | Hollywood | 668 | 2 | 624 | BATTERY - SIMPLE ASSAULT | ... | IC | Invest Cont | 624.0 | NaN | NaN | NaN | 1300 N WESTERN AV | NaN | 34.0944 | -118.3125 |
875846 | 242107187 | 2024-03-22 | 2024-03-22 | 1815 | 21 | Topanga | 2145 | 2 | 624 | BATTERY - SIMPLE ASSAULT | ... | AO | Adult Other | 624.0 | NaN | NaN | NaN | 22000 GILMORE ST | NaN | 34.1876 | -118.6070 |
875847 | 241408080 | 2024-04-01 | 2024-04-01 | 1920 | 14 | Pacific | 1432 | 1 | 310 | BURGLARY | ... | AA | Adult Arrest | 310.0 | 998.0 | NaN | NaN | 800 VENICE BL | NaN | 33.9939 | -118.4533 |
875848 | 240904953 | 2024-01-28 | 2024-01-26 | 1808 | 9 | Van Nuys | 932 | 2 | 624 | BATTERY - SIMPLE ASSAULT | ... | IC | Invest Cont | 624.0 | NaN | NaN | NaN | 14800 VICTORY BL | NaN | 34.1867 | -118.4553 |
875849 | 241507368 | 2024-03-10 | 2024-03-10 | 420 | 15 | N Hollywood | 1555 | 1 | 510 | VEHICLE - STOLEN | ... | IC | Invest Cont | 510.0 | NaN | NaN | NaN | ELMER AV | CAMARILLO ST | 34.1577 | -118.3763 |
5 rows × 28 columns
Also observe that the data in the TIME OCC column is in military time and is stored as a 64-bit integer (we know this from checking df.dtypes earlier). So let's reformat the TIME OCC column into an HH:MM string format. We will keep it in military (24-hour) time to make it easier to distinguish different timestamps without having to distinguish AM and PM.
df['TIME OCC'] = df['TIME OCC'].apply(lambda x: str(x).zfill(4))
df['TIME OCC'] = pd.to_datetime(df['TIME OCC'], format = '%H%M').dt.strftime('%H:%M')
df.head()
DR_NO | Date Rptd | DATE OCC | TIME OCC | AREA | AREA NAME | Rpt Dist No | Part 1-2 | Crm Cd | Crm Cd Desc | ... | Status | Status Desc | Crm Cd 1 | Crm Cd 2 | Crm Cd 3 | Crm Cd 4 | LOCATION | Cross Street | LAT | LON | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
875845 | 240604934 | 2024-01-21 | 2024-01-21 | 15:10 | 6 | Hollywood | 668 | 2 | 624 | BATTERY - SIMPLE ASSAULT | ... | IC | Invest Cont | 624.0 | NaN | NaN | NaN | 1300 N WESTERN AV | NaN | 34.0944 | -118.3125 |
875846 | 242107187 | 2024-03-22 | 2024-03-22 | 18:15 | 21 | Topanga | 2145 | 2 | 624 | BATTERY - SIMPLE ASSAULT | ... | AO | Adult Other | 624.0 | NaN | NaN | NaN | 22000 GILMORE ST | NaN | 34.1876 | -118.6070 |
875847 | 241408080 | 2024-04-01 | 2024-04-01 | 19:20 | 14 | Pacific | 1432 | 1 | 310 | BURGLARY | ... | AA | Adult Arrest | 310.0 | 998.0 | NaN | NaN | 800 VENICE BL | NaN | 33.9939 | -118.4533 |
875848 | 240904953 | 2024-01-28 | 2024-01-26 | 18:08 | 9 | Van Nuys | 932 | 2 | 624 | BATTERY - SIMPLE ASSAULT | ... | IC | Invest Cont | 624.0 | NaN | NaN | NaN | 14800 VICTORY BL | NaN | 34.1867 | -118.4553 |
875849 | 241507368 | 2024-03-10 | 2024-03-10 | 04:20 | 15 | N Hollywood | 1555 | 1 | 510 | VEHICLE - STOLEN | ... | IC | Invest Cont | 510.0 | NaN | NaN | NaN | ELMER AV | CAMARILLO ST | 34.1577 | -118.3763 |
5 rows × 28 columns
df.count()
DR_NO             62612
Date Rptd         62612
DATE OCC          62612
TIME OCC          62612
AREA              62612
AREA NAME         62612
Rpt Dist No       62612
Part 1-2          62612
Crm Cd            62612
Crm Cd Desc       62612
Mocodes           52184
Vict Age          62612
Vict Sex          52461
Vict Descent      52459
Premis Cd         62612
Premis Desc       62581
Weapon Used Cd    19816
Weapon Desc       19816
Status            62612
Status Desc       62612
Crm Cd 1          62612
Crm Cd 2           4000
Crm Cd 3            109
Crm Cd 4              3
LOCATION          62612
Cross Street       8421
LAT               62612
LON               62612
dtype: int64
Next, observe that Crm Cd and Crm Cd 1 contain the same information (the code for the primary crime committed), while Crm Cd 2, Crm Cd 3, and Crm Cd 4 describe additional, non-primary crimes. Since such a small percentage of our data involves additional crimes, we do not want to focus on them in our analysis; we only want to focus on the primary crime, which we already have information about. Thus, let's drop the columns Crm Cd 1, Crm Cd 2, Crm Cd 3, and Crm Cd 4.
df.drop(['Crm Cd 1', 'Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4'], axis = 1, inplace=True)
display(df.head())
print(df.columns)
DR_NO | Date Rptd | DATE OCC | TIME OCC | AREA | AREA NAME | Rpt Dist No | Part 1-2 | Crm Cd | Crm Cd Desc | ... | Premis Cd | Premis Desc | Weapon Used Cd | Weapon Desc | Status | Status Desc | LOCATION | Cross Street | LAT | LON | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
875845 | 240604934 | 2024-01-21 | 2024-01-21 | 15:10 | 6 | Hollywood | 668 | 2 | 624 | BATTERY - SIMPLE ASSAULT | ... | 517.0 | MISSIONS/SHELTERS | 400.0 | STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE) | IC | Invest Cont | 1300 N WESTERN AV | NaN | 34.0944 | -118.3125 |
875846 | 242107187 | 2024-03-22 | 2024-03-22 | 18:15 | 21 | Topanga | 2145 | 2 | 624 | BATTERY - SIMPLE ASSAULT | ... | 102.0 | SIDEWALK | 400.0 | STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE) | AO | Adult Other | 22000 GILMORE ST | NaN | 34.1876 | -118.6070 |
875847 | 241408080 | 2024-04-01 | 2024-04-01 | 19:20 | 14 | Pacific | 1432 | 1 | 310 | BURGLARY | ... | 502.0 | MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC) | NaN | NaN | AA | Adult Arrest | 800 VENICE BL | NaN | 33.9939 | -118.4533 |
875848 | 240904953 | 2024-01-28 | 2024-01-26 | 18:08 | 9 | Van Nuys | 932 | 2 | 624 | BATTERY - SIMPLE ASSAULT | ... | 210.0 | RESTAURANT/FAST FOOD | 500.0 | UNKNOWN WEAPON/OTHER WEAPON | IC | Invest Cont | 14800 VICTORY BL | NaN | 34.1867 | -118.4553 |
875849 | 241507368 | 2024-03-10 | 2024-03-10 | 04:20 | 15 | N Hollywood | 1555 | 1 | 510 | VEHICLE - STOLEN | ... | 101.0 | STREET | NaN | NaN | IC | Invest Cont | ELMER AV | CAMARILLO ST | 34.1577 | -118.3763 |
5 rows × 24 columns
Index(['DR_NO', 'Date Rptd', 'DATE OCC', 'TIME OCC', 'AREA', 'AREA NAME', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Crm Cd Desc', 'Mocodes', 'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Premis Desc', 'Weapon Used Cd', 'Weapon Desc', 'Status', 'Status Desc', 'LOCATION', 'Cross Street', 'LAT', 'LON'], dtype='object')
We can also take out the description columns (such as Crm Cd Desc, Premis Desc, Weapon Desc, and Status Desc) because each has an associated code column, so they would be redundant data. We want to avoid redundant data as much as possible since doing so frees up storage and speeds up the analyses we want to run on the data.
df.drop(['Crm Cd Desc', 'Premis Desc', 'Weapon Desc', 'Status Desc'], axis = 1, inplace=True)
display(df.head())
print(df.columns)
DR_NO | Date Rptd | DATE OCC | TIME OCC | AREA | AREA NAME | Rpt Dist No | Part 1-2 | Crm Cd | Mocodes | Vict Age | Vict Sex | Vict Descent | Premis Cd | Weapon Used Cd | Status | LOCATION | Cross Street | LAT | LON | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
875845 | 240604934 | 2024-01-21 | 2024-01-21 | 15:10 | 6 | Hollywood | 668 | 2 | 624 | 1822 0400 0416 | 22 | F | B | 517.0 | 400.0 | IC | 1300 N WESTERN AV | NaN | 34.0944 | -118.3125 |
875846 | 242107187 | 2024-03-22 | 2024-03-22 | 18:15 | 21 | Topanga | 2145 | 2 | 624 | 0448 2021 0361 0603 | 58 | M | O | 102.0 | 400.0 | AO | 22000 GILMORE ST | NaN | 34.1876 | -118.6070 |
875847 | 241408080 | 2024-04-01 | 2024-04-01 | 19:20 | 14 | Pacific | 1432 | 1 | 310 | 0344 1402 | 34 | F | O | 502.0 | NaN | AA | 800 VENICE BL | NaN | 33.9939 | -118.4533 |
875848 | 240904953 | 2024-01-28 | 2024-01-26 | 18:08 | 9 | Van Nuys | 932 | 2 | 624 | 0447 0416 | 26 | F | H | 210.0 | 500.0 | IC | 14800 VICTORY BL | NaN | 34.1867 | -118.4553 |
875849 | 241507368 | 2024-03-10 | 2024-03-10 | 04:20 | 15 | N Hollywood | 1555 | 1 | 510 | NaN | 0 | NaN | NaN | 101.0 | NaN | IC | ELMER AV | CAMARILLO ST | 34.1577 | -118.3763 |
Index(['DR_NO', 'Date Rptd', 'DATE OCC', 'TIME OCC', 'AREA', 'AREA NAME', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Mocodes', 'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Weapon Used Cd', 'Status', 'LOCATION', 'Cross Street', 'LAT', 'LON'], dtype='object')
Furthermore, we can take out redundant columns involving location, such as LOCATION and Cross Street, which are already described by LAT and LON. We will also drop Mocodes (the modus operandi codes), since we will not use it in our analysis.
df = df.drop(columns = ['LOCATION', 'Cross Street', 'Mocodes'])
Additionally, since we will use the Weapon Used Cd column, we will deal with its null values by removing those rows. Note that this keeps only the crimes in which a weapon was recorded.
df = df.dropna(subset=['Weapon Used Cd'])
We can also deal with values in Vict Sex other than male ('M') and female ('F'), including missing values, by removing those rows.
df['Vict Sex'].unique()
array(['F', 'M', 'X', nan], dtype=object)
df = df[df['Vict Sex'].isin(['F', 'M'])]
df['Vict Sex'].unique()
array(['F', 'M'], dtype=object)
Finally, we can remove irrelevant features, such as the case number, the date reported (since we have the date occurred), and information regarding the status of the case. These features are ones that we do not want to analyze because they are more involved with the case reporting process.
df = df.drop(columns = ['DR_NO', 'Date Rptd', 'Status'])
Now let's examine the shape as well as the data types of the columns again to see the changes we have implemented.
print('New Shape: ',df.shape)
print(df.dtypes)
New Shape:  (18332, 14)
DATE OCC          datetime64[ns]
TIME OCC                  object
AREA                       int64
AREA NAME                 object
Rpt Dist No                int64
Part 1-2                   int64
Crm Cd                     int64
Vict Age                   int64
Vict Sex                  object
Vict Descent              object
Premis Cd                float64
Weapon Used Cd           float64
LAT                      float64
LON                      float64
dtype: object
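As a quick sanity check (a small sketch, not part of the original notebook), we can also see how many missing values remain in the columns we kept:
# Sanity check (illustrative): count any remaining missing values per column
print(df.isna().sum())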
We are using a Chi-Squared Test to see if there is a relationship between the gender of the victim and the area where the crime occurred. We will use a significance level (alpha) of 0.05.
contingency_table = pd.crosstab(df['AREA NAME'], df['Vict Sex'])
print(contingency_table)
Vict Sex        F    M
AREA NAME             
77th Street  1040  796
Central       383  578
Devonshire    318  305
Foothill      308  341
Harbor        450  418
Hollenbeck    221  273
Hollywood     404  555
Mission       439  420
N Hollywood   423  568
Newton        412  516
Northeast     181  198
Olympic       392  479
Pacific       398  525
Rampart       346  439
Southeast     797  630
Southwest     721  657
Topanga       371  412
Van Nuys      296  381
West LA       240  283
West Valley   381  460
Wilshire      252  325
female_counts = (df[df['Vict Sex'] == 'F']['AREA NAME'].value_counts()).sort_index()
plt.figure(figsize=(10, 6))
female_counts.plot(kind='bar')
plt.xlabel('Area Name')
plt.ylabel('Number of Female Victims')
plt.title('Number of Female Victims in Different Areas')
plt.ylim(0, 1100)
plt.show()
chi2, p, dof, expected = chi2_contingency(contingency_table)
print(p)
3.063264359078526e-31
Observations
Given the above information we can see that the p-value is 3.06e-31, which is less than the alpha value of 0.05. This means that we reject the null hypothesis of independence, so we can confidently say that there is an association between the gender of the victim and the area where the crime took place.
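As a hedged follow-up sketch (not part of the original analysis), we can use the expected counts returned by chi2_contingency to compute standardized residuals, which indicate which areas contribute most to this association:
# Sketch: standardized residuals (observed - expected) / sqrt(expected).
# Large positive values under 'F' mean more female victims than independence would predict.
residuals = (contingency_table - expected) / np.sqrt(expected)
print(residuals.round(2).sort_values('F', ascending=False).head())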
Our next test is an Analysis of Variance (ANOVA). We aim to explore the relationship between the time of occurrence (morning, noon, and night) and the age of victims. ANOVA will help us determine if there is a statistically significant difference in the mean age of victims across these different time periods.
Null Hypothesis (H0): There is no difference in the mean age of victims across all time periods (morning, noon, and night).
Alternative Hypothesis (HA): There is a difference in the mean age of victims across at least one time period.
This hypothesis testing will allow us to investigate whether there exists any significant relationship between the time of occurrence and the age of victims in our dataset.
df_copy = df.copy()
df_copy[['Hour', 'Minute']] = df_copy['TIME OCC'].str.split(':', expand=True)
df_copy['Hour'] = df_copy['Hour'].astype(int)
morning_time_range = (df_copy['Hour'] >= 6) & (df_copy['Hour'] < 12) # Morning-> 6:00 to 12:00
noon_time_range = (df_copy['Hour'] >= 12) & (df_copy['Hour'] < 18) # Noon-> 12:00 to 18:00
night_time_range = ~(morning_time_range | noon_time_range) # Night-> Everything else
morning_group = df_copy[morning_time_range]['Vict Age']
noon_group = df_copy[noon_time_range]['Vict Age']
night_group = df_copy[night_time_range]['Vict Age']
age_bins = [0, 18, 35, 50, 65, 80, 95, 100]
age_labels = ['0-18', '19-35', '36-50', '51-65', '66-80', '81-95', '96+']
df_copy['Age Category'] = pd.cut(df_copy['Vict Age'], bins=age_bins, labels=age_labels, right=False)
f_statistic, p_value = f_oneway(morning_group, noon_group, night_group)
print("ANOVA - Overall Correlation between Age of Victims and Time of Occurrence:")
print("F-statistic:", f_statistic)
print("P-value:", p_value)
plt.figure(figsize=(15, 6))
# Morning
plt.subplot(1, 3, 1)
df_copy[morning_time_range]['Age Category'].value_counts().sort_index().plot(kind='bar', color='green')
plt.title('Morning')
plt.xlabel('Age Category')
plt.ylabel('Frequency')
plt.ylim(0, 4000)
# Noon
plt.subplot(1, 3, 2)
df_copy[noon_time_range]['Age Category'].value_counts().sort_index().plot(kind='bar', color='blue')
plt.title('Noon')
plt.xlabel('Age Category')
plt.ylabel('Frequency')
plt.ylim(0, 4000)
# Night
plt.subplot(1, 3, 3)
df_copy[night_time_range]['Age Category'].value_counts().sort_index().plot(kind='bar', color='orange')
plt.title('Night')
plt.xlabel('Age Category')
plt.ylabel('Frequency')
plt.ylim(0, 4000)
plt.tight_layout()
plt.show()
ANOVA - Overall Correlation between Age of Victims and Time of Occurrence:
F-statistic: 44.88127328821802
P-value: 3.596530141102993e-20
Based on the results of the ANOVA test conducted to examine the correlation between the age of victims and the time of occurrence (morning, noon, and night), we have the following conclusions:
The p-value of 3.596e-20 is less than the alpha value of 0.05.
The null hypothesis (H0), which states that there is no difference in the mean age of victims across different time periods, is therefore rejected. This decision is supported by a statistically significant F-statistic and an extremely small p-value.
Therefore, we accept the alternative hypothesis (HA), indicating that there is indeed a difference in the mean age of victims across at least one time period. This implies that the time of occurrence significantly influences the age distribution of victims.
In practical terms, these findings suggest that certain time periods may be associated with distinct age demographics among victims of crime. Further analysis and investigation could delve into the specific factors contributing to these observed differences in victim age across different times of the day.
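One possible follow-up (a hedged sketch, not part of the original analysis) is a post-hoc pairwise comparison of the three age groups, for example with Welch t-tests; a stricter treatment would also correct these p-values for multiple comparisons:
from scipy.stats import ttest_ind

# Sketch: pairwise Welch t-tests between the three time-period groups of victim ages
pairs = [('Morning', morning_group, 'Noon', noon_group),
         ('Morning', morning_group, 'Night', night_group),
         ('Noon', noon_group, 'Night', night_group)]
for name_a, group_a, name_b, group_b in pairs:
    t_stat, p_val = ttest_ind(group_a, group_b, equal_var=False)
    print(f"{name_a} vs {name_b}: t = {t_stat:.2f}, p = {p_val:.2e}")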
Now, we will explore the relationship between the time of occurrence (morning, noon, and night) and the gender of victims. ANOVA will help us determine if there is a statistically significant difference in the proportion of male and female victims across these different time periods.
Null Hypothesis (H0): There is no difference in the proportion of male and female victims across all time periods (morning, noon, and night).
Alternative Hypothesis (HA): There is a difference in the proportion of male and female victims across at least one time period.
This hypothesis testing will allow us to investigate whether there exists any significant relationship between the time of occurrence and the gender of victims in our dataset.
df_copy = df.copy()
df_copy[['Hour', 'Minute']] = df_copy['TIME OCC'].str.split(':', expand=True)
df_copy['Hour'] = df_copy['Hour'].astype(int)
morning_time_range = (df_copy['Hour'] >= 6) & (df_copy['Hour'] < 12) # Morning-> 6:00 to 12:00
noon_time_range = (df_copy['Hour'] >= 12) & (df_copy['Hour'] < 18) # Noon-> 12:00 to 18:00
night_time_range = ~(morning_time_range | noon_time_range) # Night-> Everything else
morning_group = df_copy[morning_time_range]['Vict Sex'].replace({'M': 0, 'F': 1})
noon_group = df_copy[noon_time_range]['Vict Sex'].replace({'M': 0, 'F': 1})
night_group = df_copy[night_time_range]['Vict Sex'].replace({'M': 0, 'F': 1})
f_statistic, p_value = f_oneway(morning_group, noon_group, night_group)
print("ANOVA - Overall Correlation between Sex of Victims and Time of Occurrence:")
print("F-statistic:", f_statistic)
print("P-value:", p_value)
plt.figure(figsize=(15, 6))
# Morning
plt.subplot(1, 3, 1)
morning_group.value_counts().sort_index().plot(kind='bar', color='green')
plt.title('Morning')
plt.xlabel('Sex of Victim')
plt.ylabel('Frequency')
plt.xticks(ticks=[0, 1], labels=['Male', 'Female'])
plt.ylim(0, 5000)
# Noon
plt.subplot(1, 3, 2)
noon_group.value_counts().sort_index().plot(kind='bar', color='blue')
plt.title('Noon')
plt.xlabel('Sex of Victim')
plt.ylabel('Frequency')
plt.xticks(ticks=[0, 1], labels=['Male', 'Female'])
plt.ylim(0, 5000)
# Night
plt.subplot(1, 3, 3)
night_group.value_counts().sort_index().plot(kind='bar', color='orange')
plt.title('Night')
plt.xlabel('Sex of Victim')
plt.ylabel('Frequency')
plt.xticks(ticks=[0, 1], labels=['Male', 'Female'])
plt.ylim(0, 5000)
plt.tight_layout()
plt.show()
ANOVA - Overall Correlation between Sex of Victims and Time of Occurrence:
F-statistic: 3.4921640567687287
P-value: 0.0304551894145455
Based on the results of the ANOVA test examining the correlation between the gender of victims and the time of occurrence (morning, noon, and night), we can draw the following conclusions:
The p-value of 0.03 is less than the alpha value of 0.05.
The null hypothesis (H0), which suggests that there is no difference in the proportion of male and female victims across different time periods, is therefore rejected due to a statistically significant p-value.
Consequently, we accept the alternative hypothesis (HA), indicating that there is a difference in the proportion of male and female victims across at least one time period. Looking at the bar graphs we can see that at any time of day, the number of male victims is higher than, or roughly equal to, the number of female victims.
These findings suggest that the time of occurrence indeed has an impact on the gender distribution of victims. Further analysis and investigation may be warranted to understand the underlying factors contributing to this difference and its implications in crime prevention and intervention strategies.
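To make this difference concrete, here is a small illustrative follow-up (not part of the original analysis): since female victims were coded as 1 and male victims as 0, the mean of each group is simply the share of female victims in that time period.
# Sketch: group means are the proportion of female victims in each time period
print("Share of female victims - Morning:", round(morning_group.mean(), 3))
print("Share of female victims - Noon:   ", round(noon_group.mean(), 3))
print("Share of female victims - Night:  ", round(night_group.mean(), 3))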
Bar Graph:
Here we are depicting a bar graph that gives us a visual representation of the number of male and female victims in different areas. We then use this graph, together with the one above, to compare the number of female and male victims in each area.
male_counts = (df[df['Vict Sex'] == 'M']['AREA NAME'].value_counts()).sort_index()
female_counts = (df[df['Vict Sex'] == 'F']['AREA NAME'].value_counts()).sort_index()
bar_width = 0.35
x = range(len(male_counts))
plt.figure(figsize=(12, 6))
plt.bar(x, male_counts, bar_width, label='Male', color='blue')
plt.bar([i + bar_width for i in x], female_counts, bar_width, label='Female', color='pink')
plt.xlabel('Area Name')
plt.ylabel('Number of Victims')
plt.title('Number of Male and Female Victims in Different Areas')
plt.xticks([i + bar_width/2 for i in x], male_counts.index, rotation=45)
plt.legend()
plt.tight_layout()
plt.show()
m = 0
f = 0
for location in male_counts.index:
male_count = male_counts[location]
female_count = female_counts.get(location, 0)
print(f"{location}, M: {male_count}, F: {female_count}")
if male_count > female_count:
m += 1
else:
f += 1
print(f"{m} Areas where there is a greater number of Male victims compared to Female.")
print(f"{f} Areas where there is a greater number of Female victims compared to Male.")
77th Street, M: 796, F: 1040
Central, M: 578, F: 383
Devonshire, M: 305, F: 318
Foothill, M: 341, F: 308
Harbor, M: 418, F: 450
Hollenbeck, M: 273, F: 221
Hollywood, M: 555, F: 404
Mission, M: 420, F: 439
N Hollywood, M: 568, F: 423
Newton, M: 516, F: 412
Northeast, M: 198, F: 181
Olympic, M: 479, F: 392
Pacific, M: 525, F: 398
Rampart, M: 439, F: 346
Southeast, M: 630, F: 797
Southwest, M: 657, F: 721
Topanga, M: 412, F: 371
Van Nuys, M: 381, F: 296
West LA, M: 283, F: 240
West Valley, M: 460, F: 381
Wilshire, M: 325, F: 252
15 Areas where there is a greater number of Male victims compared to Female.
6 Areas where there is a greater number of Female victims compared to Male.
Next, we are plotting a boxplot that provides information on how the number of crimes committed against women is distributed across locations. This will give us information on outliers, skew, and variability. We are also plotting a boxplot that provides information on how the number of crimes committed against men is distributed across locations. We will compare the two distributions.
# Pandas dataframe
data = pd.DataFrame({"Female Victims": female_counts, "Male Victims": male_counts})
# Plot the dataframe
ax = data[['Female Victims', 'Male Victims']].plot(kind='box', title='boxplot')
# Display the plot
plt.title("Distribution of Male and Female Victims at Each Location")
plt.ylabel("Number of Victims")
plt.show()
Observations:
There are a few characteristics that can be observed from the female boxplot above. The median lies around 390-400 and the interquartile range is about 100, meaning there is a low amount of variability. Additionally, there is a very obvious left/negative skew in the data, meaning that the data in the lower half is more spread out than the data in the upper half. Finally, we notice that there are very obvious outliers at around 1050, 800, and 720 (by looking at the data we can see the actual values are 1040, 797, and 721, respectively), which correspond to the number of female victims in the 77th Street, Southeast, and Southwest areas, respectively. It will take a more detailed analysis to understand why this may be.
For the male boxplot, the median lies around 450, suggesting that the expected number of male victims at any location is higher than that of female victims. However, the spread is much different, with an interquartile range of more than 200. Additionally, the skew is much different from the female distribution, as the overall distribution seems to have relatively little skew (with a slight right/positive skew). It will take a more detailed analysis to understand why this may be.
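To verify the outliers noted above, here is a small sketch (not part of the original notebook) that computes Tukey's fences for the female counts and lists the areas that fall outside them:
# Sketch: Tukey's fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR) for the female victim counts
q1, q3 = female_counts.quantile(0.25), female_counts.quantile(0.75)
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
print(f"IQR: {iqr:.1f}, fences: [{lower:.1f}, {upper:.1f}]")
print(female_counts[(female_counts < lower) | (female_counts > upper)])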
As we recognized in our data visualization and exploratory analysis, there is a strong relationship between the gender of the victim and the location where the crime occurred. We want to create a predictive model that will predict the gender of the victim in cases where it was not recorded.
First, let's take a look at our current dataset.
display(df.head())
df.columns
DATE OCC | TIME OCC | AREA | AREA NAME | Rpt Dist No | Part 1-2 | Crm Cd | Vict Age | Vict Sex | Vict Descent | Premis Cd | Weapon Used Cd | LAT | LON | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
875845 | 2024-01-21 | 15:10 | 6 | Hollywood | 668 | 2 | 624 | 22 | F | B | 517.0 | 400.0 | 34.0944 | -118.3125 |
875846 | 2024-03-22 | 18:15 | 21 | Topanga | 2145 | 2 | 624 | 58 | M | O | 102.0 | 400.0 | 34.1876 | -118.6070 |
875848 | 2024-01-26 | 18:08 | 9 | Van Nuys | 932 | 2 | 624 | 26 | F | H | 210.0 | 500.0 | 34.1867 | -118.4553 |
875852 | 2024-01-27 | 17:42 | 20 | Olympic | 2074 | 1 | 230 | 53 | M | W | 502.0 | 512.0 | 34.0437 | -118.3029 |
875855 | 2024-03-03 | 17:15 | 19 | Mission | 1954 | 2 | 624 | 33 | F | W | 102.0 | 400.0 | 34.2519 | -118.4673 |
Index(['DATE OCC', 'TIME OCC', 'AREA', 'AREA NAME', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Weapon Used Cd', 'LAT', 'LON'], dtype='object')
Next, let's split the data into training and testing sets.
Y = df['Vict Sex']
X = df['AREA']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3)
We will run a Logistic Regression classification because it is useful when the dependent variable is categorical, which ours is (Vict Sex is categorical: Female and Male).
#We create a model using just Area as the independent variable
X_train = np.array(X_train).reshape(-1,1)
X_test = np.array(X_test).reshape(-1,1)
model = LogisticRegression(random_state=0)
model.fit(X_train,Y_train)
results = model.predict(X_test)
#Then we display the confusion matrix, accuracy score, and classification report to determine the performance of our model
display(confusion_matrix(Y_test, results))
print(f"Accuracy: {accuracy_score(Y_test, results)}")
print(classification_report(Y_test, results))
print(np.unique(results))
array([[ 224, 2432],
       [ 264, 2580]])
Accuracy: 0.5098181818181818
              precision    recall  f1-score   support

           F       0.46      0.08      0.14      2656
           M       0.51      0.91      0.66      2844

    accuracy                           0.51      5500
   macro avg       0.49      0.50      0.40      5500
weighted avg       0.49      0.51      0.41      5500

['F' 'M']
As you can see from the confusion matrix we created, there are a lot of false positive predictions. This means that many observations were predicted to be Male when they were actually Female. This is a big issue. Because of this, we may need to add more independent variables to build a more useful model.
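Before adding variables, it also helps to compare this accuracy to a trivial baseline. The short sketch below (not part of the original notebook) prints the class balance of the labels; always predicting the majority class ('M') would already achieve roughly this share of correct predictions.
# Sketch: class balance of the training and test labels (majority-class baseline)
print(Y_train.value_counts(normalize=True))
print(Y_test.value_counts(normalize=True))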
#Here, we can take out AREA NAME because it is already captured by AREA (we did not take it out before because it was used for exploratory analysis)
df = df.drop(columns = ['AREA NAME'])
df.head()
DATE OCC | TIME OCC | AREA | Rpt Dist No | Part 1-2 | Crm Cd | Vict Age | Vict Sex | Vict Descent | Premis Cd | Weapon Used Cd | LAT | LON | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
875845 | 2024-01-21 | 15:10 | 6 | 668 | 2 | 624 | 22 | F | B | 517.0 | 400.0 | 34.0944 | -118.3125 |
875846 | 2024-03-22 | 18:15 | 21 | 2145 | 2 | 624 | 58 | M | O | 102.0 | 400.0 | 34.1876 | -118.6070 |
875848 | 2024-01-26 | 18:08 | 9 | 932 | 2 | 624 | 26 | F | H | 210.0 | 500.0 | 34.1867 | -118.4553 |
875852 | 2024-01-27 | 17:42 | 20 | 2074 | 1 | 230 | 53 | M | W | 502.0 | 512.0 | 34.0437 | -118.3029 |
875855 | 2024-03-03 | 17:15 | 19 | 1954 | 2 | 624 | 33 | F | W | 102.0 | 400.0 | 34.2519 | -118.4673 |
# Here, we are changing the Date and Time columns to processable values for the model
df['DATE OCC'] = pd.to_datetime(df['DATE OCC'])
df['TIME OCC'] = pd.to_datetime(df['TIME OCC'])
df['year'] = df['DATE OCC'].dt.year
df['month'] = df['DATE OCC'].dt.month
df['day'] = df['DATE OCC'].dt.day
# We are using the one-hot encoding to change the categorical variables into binary numerical values for easier processing
df = pd.get_dummies(df, columns = ['Vict Descent', 'AREA', 'Crm Cd', 'Vict Age', 'Weapon Used Cd'])
Y = df['Vict Sex']
X = df.drop(columns = ['Vict Sex', 'DATE OCC', 'TIME OCC'])
# Then, we are splitting the data into training and testing set and scaling them
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)  # use transform (not fit_transform) so the test set is scaled with the training set's parameters
# We then create the model and perform cross validation to prevent overfitting
model = LogisticRegression(random_state=0)
scores = cross_val_score(model, X_train_scaled, np.ravel(Y_train))
print(f"Mean: {np.mean(scores)}, Standard Deviation: {np.std(scores)}")
Mean: 0.6664579298191173, Standard Deviation: 0.008320844276428334
#Then we display the confusion matrix, accuracy score, and classification report to determine the performance of our model
model.fit(X_train_scaled,np.ravel(Y_train))
results = model.predict(X_test_scaled)
display(confusion_matrix(Y_test, results))
print(f"Accuracy: {accuracy_score(Y_test, results)}")
print(classification_report(Y_test, results))
array([[1521, 1115],
       [ 688, 2176]])
Accuracy: 0.6721818181818182
              precision    recall  f1-score   support

           F       0.69      0.58      0.63      2636
           M       0.66      0.76      0.71      2864

    accuracy                           0.67      5500
   macro avg       0.67      0.67      0.67      5500
weighted avg       0.67      0.67      0.67      5500
We have improved our accuracy, precision, and recall; however, we have increased our dimensionality tremendously. We will use PCA to reduce the number of features used in our model to prevent the Curse of Dimensionality.
pca = PCA(n_components = 2)
X_pca = pca.fit_transform(X)
#After performing PCA, we redo the previous performance analysis
X_train_pca, X_test_pca, Y_train_pca, Y_test_pca = train_test_split(X_pca, Y, test_size = 0.3)
scaler = StandardScaler()
X_train_scaled_pca = scaler.fit_transform(X_train_pca)
X_test_scaled_pca = scaler.transform(X_test_pca)  # again, only transform the test set with the training set's scaling parameters
model_pca = LogisticRegression(random_state=0)
scores = cross_val_score(model_pca, X_train_scaled_pca, np.ravel(Y_train_pca))  # cross-validate on the PCA-transformed training data
print(f"Mean: {np.mean(scores)}, Standard Deviation: {np.std(scores)}")
model_pca.fit(X_train_scaled_pca,np.ravel(Y_train_pca))
results_pca = model_pca.predict(X_test_scaled_pca)
display(confusion_matrix(Y_test_pca, results_pca))
print(f"Accuracy: {accuracy_score(Y_test_pca, results_pca)}")
print(classification_report(Y_test_pca, results_pca))
Mean: 0.5061568058647119, Standard Deviation: 0.00884306791884173
array([[1516, 1139],
       [1113, 1732]])
Accuracy: 0.5905454545454546
              precision    recall  f1-score   support

           F       0.58      0.57      0.57      2655
           M       0.60      0.61      0.61      2845

    accuracy                           0.59      5500
   macro avg       0.59      0.59      0.59      5500
weighted avg       0.59      0.59      0.59      5500
As we can see, our accuracy decreased by around 7-8%, so we decided that it would not be worth reducing dimensionality and sacrificing our accuracy, precision, and recall. Thus, we will stick with our pre-PCA model.
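One likely contributor (our own hedged reading, not stated in the original analysis) is that PCA was fit on the unscaled feature matrix, so the two components mostly track the columns with the largest numeric ranges rather than the informative one-hot features. A quick sketch to inspect how much raw variance the two components capture:
# Sketch: variance captured by the 2 principal components of the (unscaled) features
print(pca.explained_variance_ratio_)
print("Total variance captured:", pca.explained_variance_ratio_.sum())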
# Confusion Matrix
def plot_confusion_matrix(y_true, y_pred):
cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
# ROC Curve
def plot_roc_curve(y_true, y_pred_proba):
fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()
# Assuming you have y_test and predicted probabilities of class 1 (y_pred_proba)
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
y_pred = model.predict(X_test_scaled)
# Plotting Confusion Matrix
plot_confusion_matrix(Y_test, y_pred)
# Plotting ROC Curve
# Encode categorical labels to binary format
le = LabelEncoder()
Y_test_binary = le.fit_transform(Y_test)
plot_roc_curve(Y_test_binary, y_pred_proba)
Our plots show the accuracy of our model. The Confusion Matrix visually represents true positives (true males that were predicted male), false positives (females wrongfully predicted male), false negatives (males wrongfully predicted female), and true negatives (true females that were predicted female). Our confusion matrix shows that the majority of the outputs were true positives and true negatives; however, there is still a fairly noticeable proportion of false positives and false negatives.
Furthermore, in the ROC curve graph, the line y=x represents a random classifier that has no predictive power. Thus, an ROC curve that has a larger area under the curve is one that represents a model that has high classification power. If the curve is closer to the top left corner of the graph, then the model is good at determining the correct class of data. Our model seems to have a relatively large area under the curve and is not too close to the y=x line. This suggests that we have a relatively strong model in determining the correct sex of the victim.
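Since precision_recall_curve was imported earlier but not yet used, here is a short sketch of a Precision-Recall curve for the same predictions; it gives another view of the trade-off between precision and recall for the positive class (the label encoded as 1, i.e. 'M'):
# Sketch: Precision-Recall curve for the pre-PCA logistic regression model
precision, recall, _ = precision_recall_curve(Y_test_binary, y_pred_proba)
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, color='purple', lw=2)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()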
Throughout this project, we aimed to explore crime trends in Los Angeles using data from 2024. Our analysis focused on identifying potential correlations between demographic variables, economic indicators, and societal factors with crime occurrences. We also sought to create predictive models to help in cases where there is a lack of vital information.
Based on our observations, we can suggest that:
Crime rates are generally higher during the night, with a slight increase in the early morning hours. This trend suggests that increased law enforcement presence and community awareness during these hours could help in deterring crime.
There is a significant correlation between the gender of the victim and the area where the crime occurred. This finding can help law enforcement agencies and community organizations to develop targeted strategies to ensure the safety of all residents in each area.
Age and time of occurrence are also correlated. ANOVA test results indicate a statistically significant difference in the mean age of victims across different time periods. This suggests that certain time periods may be associated with distinct age demographics among victims of crime, which could inform crime prevention strategies.
Machine learning models, such as Logistic Regression, can be used to predict the gender of a victim based on various features, potentially assisting law enforcement in their investigations. We attempted PCA dimensionality reduction; however, it decreased the reliability of our model.
For future analysis, expanding the dataset to include more years and a wider range of crimes could provide further insights. Additionally, focusing on crimes with lower frequencies and analyzing the effectiveness of existing strategies for those crimes could help inform strategies for addressing higher frequency crimes. Overall, the findings from this project can contribute to a better understanding of crime trends in Los Angeles and help inform data-driven strategies for improving community safety.
We hope this tutorial not only highlighted these conclusions but also gave you insight into the data analysis process.