Movement Analysis of the Aldabra Giant Tortoise¶
By Erin Hopper¶
Introduction¶
Fine-grained animal movement data reveals a great deal about an animal: which locations are important to it, what its habits are, and which activities fill its day. However, dense and thorough movement data can be difficult to find. In this project, I aim to use the data science pipeline to analyze the behavior of the Aldabra giant tortoise and categorize his activity into specific behaviors, such as eating, foraging, or sleeping. This will uncover insights about the daily behavior and schedule of the Aldabra giant tortoise, and allow wildlife researchers to predict his movement patterns or prepare to support these tortoises in captivity.
Data Curation¶
The first step in the data science pipeline is identifying data to work with and making sure it is in the right format for analysis.
This project will analyze the movement data of an Aldabra giant tortoise (Aldabrachelys gigantea) over roughly 11.5 days of continuous recording, which consists of about 990,000 locations at one-second intervals. It was collected via accelerometer, magnetometer, and GPS in 2018. The data is very thorough and clean, and its specific nature (one individual over a short period of time) will allow for thorough analysis.
Furthermore, this data is being used in the 2024 MoveModel competition, so preliminary analysis through this project will prepare me to submit to this competition.
Data References
Redcliffe, James; Cole, Nik; Tatayah, Vikash; Wilson, Rory; Börger, Luca (2018). Aldabra giant tortoise (Aldabrachelys gigantea) high resolution movement path on Round Island (Mauritius). figshare. Dataset. https://doi.org/10.6084/m9.figshare.5808330.v1
I will begin by importing all necessary packages. These contain the tools we will use to analyze the data.
# Data storage and processing
import math
import datetime as dt
import utm
import scipy
import numpy as np
import pandas as pd
from collections import Counter
# Plots and visualizations
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns
from kneed import KneeLocator
# Machine learning
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
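Note that utm and kneed are not part of the standard scientific Python stack; if they are not already installed, they can usually be added with pip install utm kneed.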
Next, I will read the dataset into a pandas DataFrame. The .head() command displays the first few lines of the dataset.
tortoise = pd.read_csv("tortoise.csv")
tortoise.head()
|   | X | Y |
|---|---|---|
| 0 | 582628.660871 | -2.195359e+06 |
| 1 | 582628.661313 | -2.195359e+06 |
| 2 | 582628.657275 | -2.195359e+06 |
| 3 | 582628.653591 | -2.195359e+06 |
| 4 | 582628.649807 | -2.195359e+06 |
Luckily, the dataset is already very clean and thorough. However, notice that the data contains only location coordinates. I know from the dataset information that the locations were collected at 1-second intervals, so I can create a "time since data collection began" column. For now, I will use an arbitrary starting date and time: this can be changed later.
# Make time column
tortoise.insert(2, 'TimeSinceZero', range(0, len(tortoise)))
# Make a DateTime object for the time column, which is also useful
tortoise['DateTime'] = tortoise['TimeSinceZero'].astype('timedelta64[s]') + dt.datetime(2024,1,1,0,0)
# Make an animal ID column, which will be useful later on for counting
tortoise['AnimalID'] = 1
tortoise.head()
|   | X | Y | TimeSinceZero | DateTime | AnimalID |
|---|---|---|---|---|---|
| 0 | 582628.660871 | -2.195359e+06 | 0 | 2024-01-01 00:00:00 | 1 |
| 1 | 582628.661313 | -2.195359e+06 | 1 | 2024-01-01 00:00:01 | 1 |
| 2 | 582628.657275 | -2.195359e+06 | 2 | 2024-01-01 00:00:02 | 1 |
| 3 | 582628.653591 | -2.195359e+06 | 3 | 2024-01-01 00:00:03 | 1 |
| 4 | 582628.649807 | -2.195359e+06 | 4 | 2024-01-01 00:00:04 | 1 |
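As an aside, the astype('timedelta64[s]') conversion above relies on pandas casting integers to timedeltas; an equivalent and arguably more explicit spelling (a small sketch, assuming a reasonably recent pandas version) uses pd.to_timedelta:
# Equivalent DateTime construction using pd.to_timedelta (assumes pandas >= 1.0)
tortoise['DateTime'] = pd.to_timedelta(tortoise['TimeSinceZero'], unit='s') + dt.datetime(2024, 1, 1)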
Next, note that the coordinates are in UTM (Universal Transverse Mercator) rather than latitude/longitude. To aid analysis, we will convert to traditional lat/long. The tortoise is from Round Island, Mauritius, in the Southern Hemisphere, which is why we negate the Y column and pass northern=False.
convert = lambda row: utm.to_latlon(row['X'], -1*row['Y'], 40, northern=False)
latlong = tortoise.apply(convert, axis=1)
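As a quick sanity check on the conversion, a single point can be passed to utm.to_latlon directly (the values here are the first fix from the table above, rounded):
# Spot-check the UTM -> lat/long conversion on the first fix (zone 40, southern hemisphere)
print(utm.to_latlon(582628.66, 2195359.10, 40, northern=False))
# expected: roughly (-70.3342, 59.2004), matching the first row of the converted table below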
After converting to lat/long, I display the top few rows of our database to get a feel for how it looks.
tortoise[['Lat', 'Long']] = pd.DataFrame(latlong.tolist(), index=tortoise.index)
tortoise.head(10)
|   | X | Y | TimeSinceZero | DateTime | AnimalID | Lat | Long |
|---|---|---|---|---|---|---|---|
| 0 | 582628.660871 | -2.195359e+06 | 0 | 2024-01-01 00:00:00 | 1 | -70.334233 | 59.200370 |
| 1 | 582628.661313 | -2.195359e+06 | 1 | 2024-01-01 00:00:01 | 1 | -70.334233 | 59.200370 |
| 2 | 582628.657275 | -2.195359e+06 | 2 | 2024-01-01 00:00:02 | 1 | -70.334233 | 59.200370 |
| 3 | 582628.653591 | -2.195359e+06 | 3 | 2024-01-01 00:00:03 | 1 | -70.334233 | 59.200370 |
| 4 | 582628.649807 | -2.195359e+06 | 4 | 2024-01-01 00:00:04 | 1 | -70.334233 | 59.200370 |
| 5 | 582628.645878 | -2.195359e+06 | 5 | 2024-01-01 00:00:05 | 1 | -70.334233 | 59.200370 |
| 6 | 582628.641846 | -2.195359e+06 | 6 | 2024-01-01 00:00:06 | 1 | -70.334233 | 59.200369 |
| 7 | 582628.637819 | -2.195359e+06 | 7 | 2024-01-01 00:00:07 | 1 | -70.334233 | 59.200369 |
| 8 | 582628.633810 | -2.195359e+06 | 8 | 2024-01-01 00:00:08 | 1 | -70.334233 | 59.200369 |
| 9 | 582628.629736 | -2.195359e+06 | 9 | 2024-01-01 00:00:09 | 1 | -70.334233 | 59.200369 |
Exploratory Analysis¶
During data exploration, I will calculate statistics about the data to gain insight into how it is structured and what features it has. I can use statistical tools, like hypothesis testing and summary statistics, as well as visualizations, like charts and graphs, to do this.
First, we calculate some very preliminary summary statistics that will inform our data exploration.
# number of data points
print("Number of data points: ", len(tortoise))
# minimum and maximum of each location
print("X range: ", min(tortoise['X']), max(tortoise['X']))
print("Y range: ", min(tortoise['Y']), max(tortoise['Y']))
Number of data points:  990860
X range:  582488.314805739 582628.66131287
Y range:  -2195359.09875836 -2194908.79436392
Exploration 1: Activity levels over time of day¶
For the first data exploration, I will plot the change in movement per second over time to estimate the time of day that data collection begins. I will use Haversine distance to calculate the change in location between any location and the location 60 seconds ago.
My haversine_vectorize function is from this article:
“How to Calculate Distance in Python and Pandas Using Scipy Spatial and Distance Functions.” Kanoki, 27 Dec. 2019, kanoki.org/2019/12/27/how-to-calculate-distance-in-python-and-pandas-using-scipy-spatial-and-distance-functions/.
def haversine_vectorize(lon1, lat1, lon2, lat2):
# Returns distance, in meters, between one set of longitude/latitude coordinates and another
lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
newlon = lon2 - lon1
newlat = lat2 - lat1
haver_formula = np.sin(newlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(newlon/2.0)**2
dist = 2 * np.arcsin(np.sqrt(haver_formula ))
km = 6367 * dist #6367 for distance in KM for miles use 3958
return km*1000
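# Quick sanity check on the helper (hypothetical points, not from the dataset): one thousandth
# of a degree of longitude at the equator should be roughly 111 meters under the R = 6367 km
# approximation used above.
print(haversine_vectorize(0.0, 0.0, 0.001, 0.0))  # ~111.1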
# Take speed as change in distance over the past minute
tortoise['Speed(m/min)'] = haversine_vectorize(tortoise.Long, tortoise.Lat,
tortoise.Long.shift(60), tortoise.Lat.shift(60)).fillna(0)
# Converting the time to days in a separate column
tortoise['timeDays'] = tortoise['TimeSinceZero']/86400
The following is a line graph of the tortoise's speed, in meters per minute, over time (in days). Notice that the tortoise has very clear periods of activity and inactivity.
ax = tortoise.plot(x='timeDays', y='Speed(m/min)',figsize=(15,5), color="green")
ax.set_xlabel("Time (Days)")
ax.set_ylabel("Speed (meters per minute)")
ax.set_title("Tortoise Speed over Time")
ax.get_legend().remove()
plt.xticks(np.arange(0,13,1.0))
ax.grid(True, axis='x', which='major')
From Wikipedia, I know that the Aldabra giant tortoise is most active in the mornings, so I guess that the data collection begins at 6:00 am. Now, I can use a hypothesis test to see if activity levels during the morning hours (6:00 am to 12:00 pm) are significantly higher than during other hours of the day. I'll begin by categorizing the data points as either "morning" (6:00 am to 12:00 pm), "afternoon" (12:00 pm to 6:00 pm), "evening" (6:00 pm to 12:00 am), or "night" (12:00 am to 6:00 am).
def t(seconds):
ofday = seconds%(60*60*24) # time of day (in seconds)
if ofday <= (60*60*6): # during first six hours
return "morning"
elif ofday <= (60*60*12): # during first twelve hours, but not first six
return "afternoon"
elif ofday <= (60*60*18): # during first eighteen hours, but not first twelve
return "evening"
else: # during final six hours
return "night"
tortoise['timeOfDay'] = tortoise['TimeSinceZero'].apply(t)
I can now prepare to apply an ANOVA to the data to determine if activity levels are different at different times of day. ANOVA tests are used to determine if different groups of data, usually more than two groups, have the same population mean or not. More information on the ANOVA test can be found here: https://researchmethod.net/anova/
Null hypothesis: Activity levels of the tortoise are not different at different times of day.
Alternative hypothesis: Activity levels of the tortoise are different at different times of day.
I will use an alpha of 0.01 for this test.
morning = tortoise['Speed(m/min)'][tortoise['timeOfDay']=='morning']
afternoon = tortoise['Speed(m/min)'][tortoise['timeOfDay']=='afternoon']
evening = tortoise['Speed(m/min)'][tortoise['timeOfDay'] == 'evening']
night = tortoise['Speed(m/min)'][tortoise['timeOfDay'] == 'night']
print(scipy.stats.f_oneway(morning, afternoon, evening, night))
F_onewayResult(statistic=59041.457362993075, pvalue=0.0)
The p-value was so low that floating point arithmetic rounded it to zero. Since p is less than alpha, I have enough evidence to reject the null hypothesis and conclude that the tortoise's activity levels differ across times of day. Whether the period I guessed to be morning is actually the most active still needs to be checked directly, which the post-hoc test and sample means below will do.
Now, I will perform a post-hoc test to check which groups differed from the others significantly. Post-hoc tests are follow-ups to ANOVA that compare each group to every other group to see which ones stand out. I will use Tukey's HSD test.
Null hypothesis: For a given pair of time categories, the mean tortoise speeds are equal.
Alternative hypothesis: For a given pair of time categories, the mean tortoise speeds differ.
We will use an alpha of 0.01 for this test.
print(scipy.stats.tukey_hsd(morning, afternoon, evening, night))
Tukey's HSD Pairwise Group Comparisons (95.0% Confidence Interval)
Comparison  Statistic  p-value  Lower CI  Upper CI
 (0 - 1)      0.094     0.000     0.093     0.096
 (0 - 2)      0.150     0.000     0.149     0.151
 (0 - 3)      0.145     0.000     0.144     0.146
 (1 - 0)     -0.094     0.000    -0.096    -0.093
 (1 - 2)      0.055     0.000     0.054     0.056
 (1 - 3)      0.051     0.000     0.049     0.052
 (2 - 0)     -0.150     0.000    -0.151    -0.149
 (2 - 1)     -0.055     0.000    -0.056    -0.054
 (2 - 3)     -0.005     0.000    -0.006    -0.004
 (3 - 0)     -0.145     0.000    -0.146    -0.144
 (3 - 1)     -0.051     0.000    -0.052    -0.049
 (3 - 2)      0.005     0.000     0.004     0.006
Since all p-values are 0, I can reject the null hypotheses and conclude that all categories have significantly different mean speeds. To decide whether the average speed in the morning is greater than the others, I will find the sample average for each category.
print("Morning: ", morning.mean())
print("Afternoon: ", afternoon.mean())
print("Evening: ", evening.mean())
print("Night: ", night.mean())
fig, ax = plt.subplots()
ind = np.arange(4)
width = 0.7
means = [morning.mean(), afternoon.mean(), evening.mean(), night.mean()]
rects = ax.bar(ind, means, width, color='g')
ax.set_ylabel('Sample Mean Speed (meters/minute)')
ax.set_title('Sample Mean Speed for each Time Category')
ax.set_xticks(ind)
ax.set_xticklabels(('Morning', 'Afternoon', 'Evening', 'Night'))
ax.set_ylim([0,0.2])
for rect in rects:
height = rect.get_height()
ax.text(rect.get_x() + rect.get_width()/2., 1.05*height,
str(int(height*10000)/10000),
ha='center', va='bottom')
Morning:  0.1728025598387896
Afternoon:  0.07831414332904804
Evening:  0.022951714499125055
Night:  0.027771349415464115
Since the activity level is clearly higher during the morning, I can now say that the data collection does start at 6:00 am and change our time column to reflect this. Note that while the times are now accurate, the days are still arbitrary.
tortoise['DateTime'] = tortoise['DateTime'] + dt.timedelta(hours=6)
Exploration 2: Activity levels of each day¶
Next, I'll investigate the tortoise's speed over time a little more closely: by grouping the speeds of his movements, I can begin to gain insights about his behaviors. To begin, I will look at the distribution of the per-minute speed data.
ax = tortoise['Speed(m/min)'].hist(bins=30, color="green")
ax.set_xlabel("Speed (meters/minute)")
ax.set_title("Histogram of Tortoise Speed")
plt.show()
We can see from the histogram that the tortoise has roughly four distinct speed modes: sedentary (near zero movement), slow (up to about 0.2 meters/minute), medium (0.2 to 0.6 meters/minute), and fast (above 0.6 meters/minute, centered around 0.8).
Let's assign each timestamp a speed category and visualize that data.
def speed_class(speed):
if speed<0.05:
return 0
elif speed < 0.2:
return 1
elif speed < 0.6:
return 2
else:
return 3
tortoise['speedCategory'] = tortoise['Speed(m/min)'].apply(speed_class)
ax = tortoise.plot(x='timeDays', y='speedCategory', kind='scatter', s=1, color="green")
ax.grid(True, axis='x')
ax.set_xlabel("Number of Days")
ax.set_ylabel("Speed Category (0-3)")
ax.set_title("Speed Category over Time")
plt.yticks(np.arange(0, 4, 1.0))
plt.xticks(np.arange(0,13,1.0))
plt.show()
It seems from this chart that the tortoise moves much more during the first days of data collection than during the final days. I will evaluate this prediction by finding the total distance traveled each day and creating a linear regression equation.
Note: grouping the movement data by day avoids bias from the time of day (e.g. the turtle moving less at night or during the afternoon heat).
tortoise['Speed(m/s)'] = haversine_vectorize(tortoise.Long, tortoise.Lat,
tortoise.Long.shift(1), tortoise.Lat.shift(1)).fillna(0)
# Taking the integer rounds off the time in days, assigning each point the day number
tortoise['dayNum'] = tortoise['timeDays'].astype(int)
# Total distance traveled over each day
movementByDay = tortoise.groupby('dayNum')['Speed(m/s)'].sum()
ax = movementByDay.plot(color="green")
ax.set_title("Distance traveled each Day over Time")
ax.set_xlabel("Time in Days")
ax.set_ylabel("Distance per Day (meters)")
plt.show()
Next, I will fit a linear regression to this data to estimate the negative linear relationship that I predicted. A linear regression line is the single line that minimizes the total squared vertical distance between itself and the data points, and it can be used to investigate the relationship between two variables.
# Find the line of best fit
slope, intercept, rvalue, pvalue, stderr = scipy.stats.linregress(range(0, 12), movementByDay)
plot1, ax = plt.subplots()
plot1 = matplotlib.pyplot.scatter(range(0, 12), movementByDay, color="green")
x = np.arange(0, 12)
y = slope*x + intercept
plot1 = matplotlib.pyplot.plot(x,y, color='blue')
ax.set_xlabel("Time in Days")
ax.set_ylabel("Distance per Day (meters)")
ax.set_title("Linear Regression for Distance per Day over Days")
plot1
print("Distance = ", slope, "* Days + ", intercept)
print ("r = ", rvalue)
Distance =  -16.857338198721422 * Days +  201.9866665975539
r =  -0.5050858802645146
The slope of the linear regression line is negative, so the tortoise's movement per day does roughly decrease over the course of the study. However, the r-value is only -0.505, which indicates only a moderate linear relationship between day number and total distance. There are many possible explanations for the tortoise's decrease in movement (a change in temperature over the data collection period, for example). The linear relationship is not very strong, but the general decrease in activity levels over the course of the data collection should be kept in mind for the rest of the analysis.
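Since linregress already returned a p-value for the slope, an optional follow-up (not shown in the original output) is to print it alongside r to formally test whether the slope differs from zero:
# Optional check: p-value for the null hypothesis that the slope is zero
print("slope p-value: ", pvalue)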
Exploration 3: Sites of Interest¶
Next, I will attempt to find points of interest to the tortoise: areas where it spends disproportionate amounts of time. After finding these areas, I will apply a chi-squared goodness-of-fit test to see whether the predicted areas of interest are significant.
Begin by graphing the tortoise's locations. I am using UTM coordinates to minimize location warping while maintaining simplicity. Because the axes are equalized, the plot can be read as a map of the tortoise's trajectory.
fig, axs = matplotlib.pyplot.subplots(figsize = (5,10))
axs.axis('equal')
axs.set_title("Map of Tortoise Location over Time")
axs.text(.40, 1.80, 'Each square in \nthe grid is 20x20 \nmeters.', fontsize=9, transform=ax.transAxes, bbox=dict(facecolor='white', alpha=0.5))
axs.grid(True)
fig = plt.scatter(tortoise['X'],tortoise['Y'], c=tortoise['TimeSinceZero'], cmap='plasma', s=1)
axs.set_xticks(np.arange(582420, 582680, 20))
axs.set_xticklabels([])
axs.set_yticks(np.arange(-2195400, -2194860, 20))
axs.set_yticklabels([])
plt.show()
In designing a plan to find locations of interest for the tortoise, I split the tortoise's location tracks with a grid structure. I can then tally how much time he spends in each area to determine his significant places.
Each grid location will be an (x,y) tuple, where x starts at 0 and increases with every 10 m increase beyond 582470, and y starts at 0 and increases with every 20 meter increase beyond -2195400.
Note that I filter out all areas that the turtle is in for less than 2% of the data collection time to simplify the bar chart.
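As a concrete example of the mapping (using the approximate coordinates of the first fix from the table above), the arithmetic works out like this:
# Worked example: the first fix (X ~ 582628.66, Y ~ -2195359.0) falls in grid cell (15, 2)
x_cell = int((582628.66 - 582470) / 10)    # 158.66 / 10 -> 15
y_cell = int((-2195359.0 + 2195400) / 20)  # 41.0 / 20   -> 2
print((x_cell, y_cell))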
def get_area(row):
x = row['X']
y = row['Y']
x_final = int((x-582470)/10)
y_final = int((y+2195400)/20)
return (x_final, y_final)
tortoise['area'] = tortoise.apply(get_area, axis=1)
# Find the amount of time he spends in each area
areacounts = tortoise.groupby('area')['AnimalID'].sum()
# Keep only the areas where he spends at least 20,000 seconds (about 2% of the data)
totaltime = areacounts.sum()
areacounts2 = areacounts.loc[lambda x : x >= 20000]
timeleft = areacounts2.sum()
# Create an 'other' category for the areas we filtered out
print(totaltime-timeleft)
areacounts2.at['other']= totaltime-timeleft
areacounts2pct = (areacounts2*100)/sum(areacounts2)
fig, ax = plt.subplots()
ax.set_title("Percent Time Spent in Each Area")
ax.set_xlabel("Area")
ax.set_ylabel("Percent of time in Area")
rects = ax.bar(np.arange(9), areacounts2pct, 0.5, color='g')
ax.set_xticks(np.arange(9))
ax.set_xticklabels(areacounts2pct.index, rotation=45, ha='right')
ax.set_ylim((0,40))
for rect in rects:
height = rect.get_height()
ax.text(rect.get_x() + rect.get_width()/2., 1.05*height,
str(int(height*10)/10),
ha='center', va='bottom')
plt.show()
277287
From the bar chart, it is clear that the tortoise spends the most time (well over 5% each) in the following areas:
- (4, 24): 32.2 %
- (3, 13): 12.5 %
- (2, 19): 9.4 %
I predict that these are areas of importance. To see if the tortoise actually spends a statistically significant amount of time in these areas, we can use a chi-squared goodness-of-fit test, which will determine whether the observed distribution of time across areas differs from a uniform distribution. More information on the chi-squared test and how the DOF (degrees of freedom) parameter was calculated can be found here.
Null hypothesis: The sample occurrences come from a uniform distribution.
Alternative hypothesis: The sample occurrences do not come from a uniform distribution; i.e., the differences between the occurrence distribution and a uniform distribution are statistically significant.
I will use alpha = 0.01 for this significance test.
areacountsdf = pd.DataFrame(areacounts)
areacountsdf = areacountsdf.rename(columns={"AnimalID":"SampleOccurences"})
dof = len(areacountsdf)-1
# The default expected distribution for chisquare is uniform, so f_exp does not need to be specified.
# Note: scipy's ddof is *subtracted* from the default k-1 degrees of freedom, so passing
# ddof = k-1 leaves zero degrees of freedom; this is what produces the nan p-value below.
scipy.stats.chisquare(areacountsdf['SampleOccurences'], ddof=dof)
Power_divergenceResult(statistic=10476917.027180428, pvalue=nan)
That's a very high test statistic! The p-value came back as nan, but this is an artifact of the ddof argument rather than numerical overflow: scipy subtracts ddof from the default k - 1 degrees of freedom, so passing ddof = k - 1 leaves zero degrees of freedom and the p-value is undefined. With the proper k - 1 degrees of freedom, a statistic in the millions corresponds to a p-value effectively equal to zero, which is certainly lower than alpha = 0.01, so I can reject the null hypothesis and claim that the observed occurrences do not come from a uniform distribution.
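To make the ddof behavior concrete, here is a tiny illustration on made-up counts (not tortoise data); the default call keeps k - 1 degrees of freedom, while ddof = k - 1 reproduces the nan seen above.
# Toy illustration of scipy's ddof argument (hypothetical counts, k = 3 categories)
toy_counts = [10, 20, 30]
print(scipy.stats.chisquare(toy_counts))          # dof = k-1 = 2: statistic 10.0, p ~ 0.007
print(scipy.stats.chisquare(toy_counts, ddof=2))  # dof = k-1-ddof = 0: p-value is nan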
The test still may not be accurate, though, because I included many areas that the tortoise barely spends any time in; he may just pass through them while traveling or clip a corner of them while moving from place to place. I will repeat the chi-squared test, but remove the categories where the tortoise spends an insignificant (< 6 hr) amount of time. All hypotheses remain the same.
areacounts = areacounts.loc[lambda x : x >= 6*60*60]
areacountsdf = pd.DataFrame(areacounts)
areacountsdf = areacountsdf.rename(columns={"AnimalID":"SampleOccurences"})
dof = len(areacountsdf)-1
# As before, ddof = k-1 zeroes out the degrees of freedom, so the p-value will again be nan
scipy.stats.chisquare(areacountsdf['SampleOccurences'], ddof=dof)
Power_divergenceResult(statistic=777413.591017317, pvalue=nan)
The test statistic is smaller than in the last test, and the p-value is again nan for the same reason (the ddof argument leaves zero degrees of freedom). With the proper k - 1 degrees of freedom, a statistic this large still corresponds to a p-value far below alpha, so I have enough evidence to reject the null hypothesis and accept the claim that the tortoise does have significant areas of interest in the places mentioned above.
To finish off the data exploration, I will lay these areas over the original movement plot to see where in his trajectory the tortoise spends the most time. The blue rectangles are the top three identified areas of interest.
x = [4, 3, 2]
y = [24, 13, 19]
data = {'X': x, 'Y': y}
sigpoints = pd.DataFrame(data)
sigpoints['X'] = (sigpoints['X']*10)+582470
sigpoints['Y'] = (sigpoints['Y']*20)-2195400
fig, axs = matplotlib.pyplot.subplots(figsize = (5,10))
axs.axis('equal')
add = lambda row: axs.add_patch(plt.Rectangle((row['X'], row['Y']), 10, 20))
sigpoints.apply(add, axis=1)
axs.set_title("Map of Tortoise Location over Time")
axs.text(.40, 1.80, 'Each square in \nthe grid is 20x20 \nmeters.', fontsize=9, transform=ax.transAxes, bbox=dict(facecolor='white', alpha=0.5))
axs.grid(True)
fig = plt.scatter(tortoise['X'],tortoise['Y'], c=tortoise['TimeSinceZero'], cmap='plasma', s=1)
axs.set_xticks(np.arange(582420, 582680, 20))
axs.set_xticklabels([])
axs.set_yticks(np.arange(-2195400, -2194860, 20))
axs.set_yticklabels([])
plt.show()
Primary Analysis and Visualizations¶
Based on my analysis, the tortoise has clearly defined behavioral patterns, including a daily schedule of active vs. inactive times and clear favorite spots where he spends his time. Therefore, the next step of my analysis is to use machine learning to categorize his patterns of behavior (based on location, activity level, and time of day) into behaviors like foraging, sleeping, or travelling.
To do this, I will use an unsupervised learning model. These are machine learning models that take unlabeled data and attempt to cluster it into groups of similar points. There are different types of clustering, so I will begin by selecting a model that matches my data. More information about types of clustering can be found here.
Since most clustering methods perform best on quantitative data, I will first make edits to my dataset to prime it for clustering. Before using any machine learning method, I will apply Principal Component Analysis to reduce the dimensionality of my data and make the clusters easier to visualize. Then, based on the spatial nature of my data and the unknown number of clusters, I will use the DBSCAN clustering method to identify different tortoise behaviors.
Preparing the Data¶
First, I will prepare the features of the dataset that I plan to use for clustering. One important feature is the speed of the tortoise; I will use speed over the past five minutes to reduce noise from the tortoise stopping and starting. I will also use the time of day, assuming that tortoise behavior cycles throughout the day. Finally, I will need some measure of how much he likes an area. From the last exploration, I have a categorical measure of this (the amount of time spent in each grid area), but I will need a quantitative one.
Also, because of the time-intensive nature of the machine learning calculations (and to reduce noise), I will trim the dataset to one point per minute. I will do this after the aggregated speed calculation so as not to lose data.
Creating these new features will be fairly simple.
Note that this code block takes about 2 hours to run because of the data-intensive calculations and aggregation in the count_adjacent_rows() function. To avoid running it, the csv file with the output of this cell (and all the cells above) can be found here.
# Add a column for time (in seconds) of the day
tortoise['Time'] = tortoise['TimeSinceZero']%(60*60*24)
# Note that this means each day "splits" at 6am, which is ok because we see a sharp behavioral change at 6am anyway.
# Add a column for mean speed over the last five minutes (in meters/min)
tortoise['aggregateSpeed(m/min)'] = (.2)*haversine_vectorize(tortoise.Long, tortoise.Lat,
tortoise.Long.shift(300), tortoise.Lat.shift(300)).fillna(0)
# Trim the dataset to one point every minute
tortoiseOld = tortoise.copy()
tortoise = tortoiseOld[tortoiseOld['TimeSinceZero'] % 60 == 0]
print("New length of dataset: ", len(tortoise))
# Add a column for how many minutes (consecutive one-minute samples) the tortoise spends within 20 meters of his current location
# recall that the haversine_vectorize function calculates distance between two lat-longs
def count_adjacent_rows(index, df):
current_row = df.loc[index]
count = 0
for i in range(index - 60, -60, -60):
distance = haversine_vectorize(current_row['Long'], current_row['Lat'], df.loc[i]['Long'], df.loc[i]['Lat'])
if distance <= 20:
count += 1
else:
break
for i in range(index + 60, len(df)*60, 60):
distance = haversine_vectorize(current_row['Long'], current_row['Lat'], df.loc[i]['Long'], df.loc[i]['Lat'])
if distance <= 20:
count += 1
else:
break
return count
tortoise = tortoise.assign(**{'timeNearby':tortoise.apply(lambda row: count_adjacent_rows(row.name, tortoise), axis=1)})
#tortoise.to_csv('tortoiseoutput.csv', index=False)
New length of dataset: 16515
Finally, create a new dataframe with only these columns. I will apply the machine learning techniques to this new dataframe.
#tortoise = pd.read_csv('tortoiseoutput.csv')
newtortoise = tortoise[['Time', 'aggregateSpeed(m/min)', 'timeNearby']]
Principal Component Analysis and DBSCAN¶
To begin, I will perform Principal Component Analysis on the dataframe. This helps reduce the dimensionality of the dataset by using the fact that some of the features are already correlated with each other. PCA also lets us plot results in 2 dimensions instead of 3 while still capturing most of the information in the dataset, which will make results easier to visualize later. However, since I am already working with only three dimensions, reducing further may not be worthwhile if most of the variance cannot be explained by just two of them.
First, let's make a heatmap to display the correlation matrix between each dimension.
# Display the correlation matrix using a heatmap
fig, ax = plt.subplots(figsize=(4, 4))
ax.set_title("Heatmap of Tortoise Features")
sns.heatmap(newtortoise.corr(), cmap="YlGnBu", annot=True)
plt.show()
This heatmap shows the correlation of each data feature with every other feature. Since no two features are highly correlated, they are all separate factors that are important to consider. Now I can proceed with PCA. I will first scale the data, then fit sklearn's PCA and examine the explained variance ratio, which shows how much of the variance in the data each principal component captures.
# Scale the data
scaler = StandardScaler()
newtortoise = pd.DataFrame(
scaler.fit_transform(newtortoise), columns=['Time', 'aggregateSpeed(m/min)', 'timeNearby']
)
# Perform PCA
pcaModel = PCA(n_components=3)
pcaModel.fit(newtortoise)
print(pcaModel.explained_variance_ratio_)
[0.5561368 0.29258821 0.15127499]
From the explained variance ratio, the first component explains about 56% of the variance, the first two components explain about 85%, and all three of course explain 100%. Therefore, I will continue to use all 3 dimensions, since dropping the last component would lose a meaningful share of the variation. However, I will still use the PCA transform so that results can be visualized in the first two dimensions. This will be useful when refining hyperparameters of the DBSCAN model.
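For reference, the cumulative version of the same output makes the 56% / 85% / 100% figures explicit:
# Cumulative explained variance across the three principal components
print(np.cumsum(pcaModel.explained_variance_ratio_))  # roughly [0.556, 0.849, 1.0]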
pca = PCA(n_components = 3)
tortoise_PCA = pca.fit_transform(newtortoise)
tortoise_PCA = pd.DataFrame(tortoise_PCA)
tortoise_PCA.columns = ['P1', 'P2', 'P3']
print(tortoise_PCA.head())
         P1        P2        P3
0  1.262829 -0.659211 -1.749047
1  1.261455 -0.657479 -1.747995
2  1.260082 -0.655748 -1.746943
3  1.258438 -0.654272 -1.745619
4  1.257065 -0.652540 -1.744568
Now that I have scaled and transformed the dataset, I can perform DBSCAN to cluster the points. DBSCAN has two hyperparameters that I will have to select - epsilon and min_samples. Epsilon is the maximum distance between two points for them to be considered "neighbors." Min_samples is the smallest number of neighbors a point must have to be considered a core point. Clusters are eventually made of core points that are neighbors of each other, and all of their neighbors. A more detailed walkthrough of the DBSCAN algorithm can be found here.
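Before applying DBSCAN to the tortoise data, a minimal toy sketch (made-up points, my own example) shows how the two hyperparameters behave: with eps=0.5 and min_samples=2, two tight groups become clusters and the lone far-away point gets the outlier label -1.
# Toy DBSCAN illustration on made-up 2-D points (not tortoise data)
toy_points = np.array([[0, 0], [0, 0.1], [0.1, 0],
                       [5, 5], [5, 5.1], [5.1, 5],
                       [10, 10]])
toy_db = DBSCAN(eps=0.5, min_samples=2).fit(toy_points)
print(toy_db.labels_)  # expected: [0 0 0 1 1 1 -1]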
First, I will find a value for the hyperparameter epsilon. One way to do this is with a k-nearest-neighbors approach. Without going into detail, I will find the distances from each point to its k nearest neighbors, then plot these distances on a graph. I will use the "elbow method" to pick a value that captures as much information as possible while reducing noise. More information on this approach can be found here. For a scholarly, slightly denser walkthrough, see this paper. This second paper also explains why I use k = 2*(number of dimensions) - 1 in KNN.
# Find k-value
k = 2 * tortoise_PCA.shape[-1] - 1
# Take KNN on the data
nbrs = NearestNeighbors(n_neighbors=k, radius=1).fit(tortoise_PCA)
# Store and sort the distances
distances, indices = nbrs.kneighbors(tortoise_PCA)
distances = np.sort(distances, axis=0)
numdist = len(distances)
distances = distances[:, k-1]
# Make the elbow plot
fig, ax = plt.subplots(figsize=(6,6))
fig = plt.plot(distances, color='g')
ax.set_xlabel('Number of Points in the dataset', fontsize=12)
ax.set_ylabel(f'{k}-nearest neighbor distance', fontsize=12)
ax.set_title('Elbow Plot for KNN distances')
ax.set_yticks(np.arange(0, 1.2, 0.05))
ax.grid(visible=True)
plt.show()
I can use the KneeLocator package to find where the elbow of the graph is. It can be difficult to tell by eye alone, since the value appears to be anywhere from 0.05 to 0.3.
# Set up and find the y-value of the elbow point
kneedle = KneeLocator(x = range(1, numdist+1), y = distances, S = 1.0,
curve = "concave", direction = "increasing", online=True)
print(kneedle.knee_y)
0.19342877260316022
Therefore, I will start with epsilon = 0.2. From the above paper, a good estimate for min_samples is k+1, so 6. Note that these are just starting estimates: choosing hyperparameters for DBSCAN is very tricky, so I will try a variety of different parameters and combinations to see what works best with this specific data.
# Set hyperparameters
epsilon = 0.2
min_samples = 6
# Initialize and fit the model
db_start = DBSCAN(eps = epsilon, min_samples = min_samples).fit(tortoise_PCA)
labels = db_start.labels_
# Print summary of the clustering
nclusters = len(set(labels)) - (1 if -1 in labels else 0)
print("Number of clusters: ", nclusters)
print("Number of outliers: ", list(labels).count(-1))
counts = Counter(db_start.labels_)
for i in range(0, nclusters):
print("Cluster", i, ":", counts[i], "items")
Number of clusters:  18
Number of outliers:  78
Cluster 0 : 7200 items
Cluster 1 : 8 items
Cluster 2 : 9 items
Cluster 3 : 356 items
Cluster 4 : 130 items
Cluster 5 : 21 items
Cluster 6 : 31 items
Cluster 7 : 5413 items
Cluster 8 : 27 items
Cluster 9 : 24 items
Cluster 10 : 16 items
Cluster 11 : 33 items
Cluster 12 : 9 items
Cluster 13 : 907 items
Cluster 14 : 2041 items
Cluster 15 : 98 items
Cluster 16 : 102 items
Cluster 17 : 12 items
From this initial parameterization, I can see three main clusters (clusters 0, 7, and 14) and a few smaller ones. Graphing this clustering in the first two dimensions will show us these clusters. Note that there is a third dimension of the data that is not visualized, so any clusters that appear overlapping in this plot likely differ in the third dimension. Also, outliers are represented by the blue -1 cluster. My method for plotting DBSCAN results is from this blog post.
p = sns.scatterplot(data = tortoise_PCA, x = "P1", y = "P2", hue = db_start.labels_, legend = "full", palette = "muted", linewidth = 0)
sns.move_legend(p, "upper right", bbox_to_anchor = (1.17, 1.), title = 'Clusters')
p.set(title="Initial Clustering Results")
plt.show()
This initial clustering is fine, but it's worthwhile to try other hyperparameters to see if I can improve the clustering and group some of those separate points together. Finding an elbow point is not an exact science, so I will try a range of epsilon values from 0.05 to 0.5. Also, the right min_samples varies depending on the dataset, so I will try the surrounding values (5, 6, and 7). I will investigate and plot the results of each combination, and then select the best one as the final clustering.
# Lists to store the hyperparameters and models
possible_epsilon = np.arange(0,10)
possible_min_samples = np.arange(0,3)
models = []
def get_model(epsilon, min_s):
return DBSCAN(eps = ((epsilon*0.05)+0.05), min_samples = (min_s+5)).fit(tortoise_PCA)
def get_summarystats(epsilon, min_s, verbose=True):
global models
eps = ((epsilon*0.05)+.05)
ms = min_s+5
model = models[epsilon][min_s]
labels = model.labels_
nclusters = len(set(labels)) - (1 if -1 in labels else 0)
counts = Counter(model.labels_)
if verbose:
print(f"Epsilon={eps:.2f}, min_samples={ms}: {nclusters} clusters and {list(labels).count(-1)} outliers")
print(list(counts.values()))
else:
total = 0
for i in counts.values():
if i >= 1000:
total += 1
print(f"Epsilon={eps:.2f}, min_samples={ms}: {nclusters} clusters, {list(labels).count(-1)} outliers, and {total} significant groups")
print("")
# Create all the models and print summary statistics of each
for i in possible_epsilon:
models.append([])
for j in possible_min_samples:
models[i].append(get_model(i, j))
get_summarystats(i,j, verbose=False)
Epsilon=0.05, min_samples=5: 71 clusters, 503 outliers, and 4 significant groups
Epsilon=0.05, min_samples=6: 64 clusters, 587 outliers, and 4 significant groups
Epsilon=0.05, min_samples=7: 59 clusters, 672 outliers, and 3 significant groups
Epsilon=0.10, min_samples=5: 44 clusters, 187 outliers, and 6 significant groups
Epsilon=0.10, min_samples=6: 40 clusters, 234 outliers, and 6 significant groups
Epsilon=0.10, min_samples=7: 38 clusters, 259 outliers, and 6 significant groups
Epsilon=0.15, min_samples=5: 23 clusters, 98 outliers, and 4 significant groups
Epsilon=0.15, min_samples=6: 22 clusters, 110 outliers, and 4 significant groups
Epsilon=0.15, min_samples=7: 23 clusters, 119 outliers, and 5 significant groups
Epsilon=0.20, min_samples=5: 20 clusters, 63 outliers, and 3 significant groups
Epsilon=0.20, min_samples=6: 18 clusters, 78 outliers, and 3 significant groups
Epsilon=0.20, min_samples=7: 17 clusters, 89 outliers, and 3 significant groups
Epsilon=0.25, min_samples=5: 16 clusters, 38 outliers, and 3 significant groups
Epsilon=0.25, min_samples=6: 15 clusters, 46 outliers, and 3 significant groups
Epsilon=0.25, min_samples=7: 16 clusters, 48 outliers, and 3 significant groups
Epsilon=0.30, min_samples=5: 13 clusters, 23 outliers, and 3 significant groups
Epsilon=0.30, min_samples=6: 13 clusters, 31 outliers, and 3 significant groups
Epsilon=0.30, min_samples=7: 13 clusters, 33 outliers, and 3 significant groups
Epsilon=0.35, min_samples=5: 10 clusters, 18 outliers, and 3 significant groups
Epsilon=0.35, min_samples=6: 11 clusters, 23 outliers, and 3 significant groups
Epsilon=0.35, min_samples=7: 12 clusters, 24 outliers, and 3 significant groups
Epsilon=0.40, min_samples=5: 10 clusters, 8 outliers, and 3 significant groups
Epsilon=0.40, min_samples=6: 9 clusters, 14 outliers, and 3 significant groups
Epsilon=0.40, min_samples=7: 10 clusters, 14 outliers, and 3 significant groups
Epsilon=0.45, min_samples=5: 9 clusters, 4 outliers, and 3 significant groups
Epsilon=0.45, min_samples=6: 9 clusters, 5 outliers, and 3 significant groups
Epsilon=0.45, min_samples=7: 10 clusters, 5 outliers, and 3 significant groups
Epsilon=0.50, min_samples=5: 6 clusters, 2 outliers, and 2 significant groups
Epsilon=0.50, min_samples=6: 6 clusters, 3 outliers, and 2 significant groups
Epsilon=0.50, min_samples=7: 6 clusters, 3 outliers, and 2 significant groups
Criteria for a good model include not having too many outliers, not introducing too much noise through an excess of tiny clusters, and not losing any important information. The key is finding models that are neither underfit nor overfit.
Based on these summary statistics, I can rule out epsilon=0.5, since it clusters the data into mainly two large categories, losing the information of the third significant category present in the other clusterings; this means the model is underfit. Also, epsilon < 0.3 produces lots of small clusters that are likely noise and lots of outliers, meaning those models are likely overfit. Furthermore, the value of min_samples appears not to change the clustering much, except that a lower min_samples tends to group outliers into large clusters, so I will stick with the lowest choice, min_samples = 5. Now there are only four models to choose from: min_samples = 5 with epsilon in the set {0.3, 0.35, 0.4, 0.45}.
Now, re-print the summary statistics and make plots for these models to further investigate.
possible_epsilon = np.arange(5,9)
min_samples = 0
fig, axs = plt.subplots(2, 2, figsize=(16,16))
p1 = sns.scatterplot(data = tortoise_PCA, x = "P1", y = "P2", hue = models[5][0].labels_, legend = "full", palette = "muted", linewidth = 0, ax = axs[0,0])
sns.move_legend(p1, "upper right", bbox_to_anchor = (1.17, 1.), title = 'Clusters')
axs[0, 0].set_title(f"Clustering Results with epsilon={(5*0.05)+0.05:.2f} and min_samples=5")
p2 = sns.scatterplot(data = tortoise_PCA, x = "P1", y = "P2", hue = models[6][0].labels_, legend = "full", palette = "muted", linewidth = 0, ax = axs[0,1])
sns.move_legend(p2, "upper right", bbox_to_anchor = (1.17, 1.), title = 'Clusters')
axs[0, 1].set_title(f"Clustering Results with epsilon={(6*0.05)+0.05:.2f} and min_samples=5")
p3 = sns.scatterplot(data = tortoise_PCA, x = "P1", y = "P2", hue = models[7][0].labels_, legend = "full", palette = "muted", linewidth = 0, ax = axs[1,0])
sns.move_legend(p3, "upper right", bbox_to_anchor = (1.17, 1.), title = 'Clusters')
axs[1, 0].set_title(f"Clustering Results with epsilon={(7*0.05)+0.05:.2f} and min_samples=5")
p4 = sns.scatterplot(data = tortoise_PCA, x = "P1", y = "P2", hue = models[8][0].labels_, legend = "full", palette = "muted", linewidth = 0, ax = axs[1,1])
sns.move_legend(p4, "upper right", bbox_to_anchor = (1.17, 1.), title = 'Clusters')
axs[1, 1].set_title(f"Clustering Results with epsilon={(8*0.05)+0.05:.2f} and min_samples=5")
plt.show()
get_summarystats(5, 0)
get_summarystats(6, 0)
get_summarystats(7, 0)
get_summarystats(8, 0)
Epsilon=0.30, min_samples=5: 13 clusters and 23 outliers
[5, 7378, 23, 22, 367, 24, 31, 5413, 27, 16, 45, 2951, 111, 102]
Epsilon=0.35, min_samples=5: 10 clusters and 18 outliers
[5, 7750, 18, 22, 24, 31, 5413, 27, 16, 3098, 111]
Epsilon=0.40, min_samples=5: 10 clusters and 8 outliers
[5, 7753, 8, 25, 24, 32, 5416, 27, 16, 3098, 111]
Epsilon=0.45, min_samples=5: 9 clusters and 4 outliers
[7761, 4, 25, 24, 32, 5416, 28, 16, 3098, 111]
Now I have enough information to select a parameterization. The differences between the plots are subtle, but not negligible. Note the purple category on the far right in the epsilon=0.3 graph: it should probably be clustered with the rest of the green category, as it is in the other three graphs, so I will rule out epsilon=0.3.
The other differences are much less important. There are a few points that land in different categories depending on the model, but it is hard to say that one model is better than another. I will choose epsilon=0.4, the middle of the remaining values, to balance the behavior of all three graphs.
It's also important to realize that none of these categorizations is more "correct" than the others: plenty of these data points are influenced by noise, and there is no single true category for them. I will still take the results with a grain of salt, since no clustering is perfect. All of the final candidate models were very similar, so the ultimate parameter choice could differ depending on the data scientist building the model.
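As an optional, more quantitative tie-breaker (my addition; the choice above does not depend on it), the shortlisted models could be compared with a silhouette score computed on the non-outlier points:
# Optional tie-breaker sketch: silhouette score per shortlisted model, ignoring DBSCAN outliers (-1)
from sklearn.metrics import silhouette_score
for eps_idx in [5, 6, 7, 8]:  # epsilon = 0.30, 0.35, 0.40, 0.45 with min_samples = 5
    labels = models[eps_idx][0].labels_
    mask = labels != -1
    score = silhouette_score(tortoise_PCA[mask], labels[mask])
    print(f"epsilon={(eps_idx*0.05)+0.05:.2f}: silhouette={score:.3f}")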
Now, I can assign these categorizations back to specific datapoints in the tortoise dataset.
realmodel = models[7][0]  # epsilon = (7*0.05)+0.05 = 0.40, min_samples = 5
tortoise = tortoise.assign(**{'Behavior':realmodel.labels_})
Interpretation¶
Now that I have behavioral categories, we can attempt to interpret them to see what they mean for the tortoise! Start by seeing how many occurrences of each behavior we have.
ax = tortoise['Behavior'].value_counts().sort_index().plot(kind='bar', color='g')
ax.set_title("Counts for Tortoise Behavior")
ax.set_ylabel("Counts")
plt.show()
We can see that the only significant categories are 1, 5, and 8. Since classifying animal behavior is not an exact science, we can write off the other points as outliers or assign them the behavior the tortoise is exhibiting at nearby times. Analyzing those small groups as deeply as the large groups would be overanalyzing and overfitting, as categories that small are probably just noise.
To get a sense of these behaviors, let's see how they compare to a few different metrics of the tortoise that we already have. We can start by looking at his behaviors over time.
ax = tortoise.plot(x='timeDays', y='Behavior', kind='scatter', s=1, color="green")
ax.set_xlabel("Number of Days")
ax.set_ylabel("Behavior")
ax.set_title("Behavior over Time")
plt.yticks(np.arange(0, 10, 1.0))
plt.xticks(np.arange(0,13,1.0))
plt.show()
It appears from this plot that the tortoise's behaviors vary more from day to day than cycling within each day like we might expect. He might have "lazy days," "active days," "travel days," etc.
From the plot, we can also begin to group those outlier behaviors. Behaviors -1, 0 and 2 are clearly subsets of behavior 1, so we can group them in with behavior 1. Similarly, behaviors 7 and 9 are subsets of behavior 8. We can also group behavior 3 and 4 together since they occur near each other temporally, as well as behaviors 5 and 6.
When grouping behavior like this, I am not saying that two behavior groups share characteristics, only that they are one "behavioral act" that has elements of both categories. For example, the behavior represented by categories 7, 8, and 9 might be an act like foraging where the tortoise spends time moving to find food and stopping to eat it, which are subsets of the same act.
Also, we might expect behaviors 3-4 to represent a transition between two of the main behaviors.
Let's redefine the behavior categories to group them into the four categories described above.
def reclassify(x):
if x == -1 or x == 0 or x == 2 or x == 1:
return 1
elif x == 3 or x == 4:
return 3
elif x == 5 or x == 6:
return 5
elif x == 7 or x == 9 or x == 8:
return 8
tortoise = tortoise.assign(**{'Behavior':tortoise['Behavior'].apply(reclassify)})
ax = tortoise.plot(x='timeDays', y='Behavior', kind='scatter', s=1, color="green")
ax.set_xlabel("Number of Days")
ax.set_ylabel("Behavior")
ax.set_title("Behavior over Time")
plt.yticks(np.arange(0, 9, 1.0))
plt.xticks(np.arange(0,13,1.0))
plt.show()
Next, we can begin to match activity patterns to behavioral categories by finding some summary statistics. For behaviors 1, 3, 5, and 8, we will plot histograms of speed and timeNearby. Then, we will plot his behavior over the spatial graph to see if location matches up to behavior at all. From this analysis, we will characterize each of these behaviors.
Note that the y-axes of the following graphs do not align; because these are histograms, the proportions matter much more than the raw frequencies.
fig, axs = plt.subplots(2, 2, figsize=(16,16))
p1 = tortoise['Speed(m/min)'][tortoise['Behavior'] == 1].hist(bins=30, color="green", ax = axs[0,0])
axs[0,0].set_xlabel("Speed (meters/minute)")
axs[0,0].set_title("Histogram of Tortoise Speed, Behavior=1")
axs[0,0].set_xlim([0, 0.9])
p2 = tortoise['Speed(m/min)'][tortoise['Behavior'] == 3].hist(bins=30, color="green", ax = axs[0,1])
axs[0,1].set_xlabel("Speed (meters/minute)")
axs[0,1].set_title("Histogram of Tortoise Speed, Behavior=3")
axs[0,1].set_xlim([0, 0.9])
p3 = tortoise['Speed(m/min)'][tortoise['Behavior'] == 5].hist(bins=30, color="green", ax = axs[1,0])
axs[1, 0].set_xlabel("Speed (meters/minute)")
axs[1, 0].set_title("Histogram of Tortoise Speed, Behavior=5")
axs[1, 0].set_xlim([0, 0.9])
p4 = tortoise['Speed(m/min)'][tortoise['Behavior'] == 8].hist(bins=30, color="green", ax = axs[1,1])
axs[1, 1].set_xlabel("Speed (meters/minute)")
axs[1, 1].set_title("Histogram of Tortoise Speed, Behavior=8")
axs[1, 1].set_xlim([0, 0.9])
plt.show()
We can generalize the speeds by behavior as follows:
- Behavior 1 (main behavior): Includes all high speed motion, as well as some low speed motion.
- Behavior 3 (transition behavior): Consistent mid speed motion.
- Behavior 5 (main behavior): Almost no motion.
- Behavior 8 (main behavior): Fairly consistent low speed motion with breaks.
Next, we will plot histograms of behavior vs. time nearby to see which behaviors occur in areas of interest and which behaviors occur in areas that the tortoise is just passing through.
fig, axs = plt.subplots(2, 2, figsize=(16,16))
p1 = tortoise['timeNearby'][tortoise['Behavior'] == 1].hist(bins=30, color="green", ax = axs[0,0])
axs[0,0].set_xlabel("Minutes Spent within 20 Meters")
axs[0,0].set_title("Histogram of Time Nearby, Behavior=1")
axs[0,0].set_xlim([0, 6500])
p2 = tortoise['timeNearby'][tortoise['Behavior'] == 3].hist(bins=30, color="green", ax = axs[0,1])
axs[0,1].set_xlabel("Minutes Spent within 20 Meters")
axs[0,1].set_title("Histogram of Time Nearby, Behavior=3")
axs[0,1].set_xlim([0, 6500])
p3 = tortoise['timeNearby'][tortoise['Behavior'] == 5].hist(bins=10, color="green", ax = axs[1,0])
axs[1, 0].set_xlabel("Minutes Spent within 20 Meters")
axs[1, 0].set_title("Histogram of Time Nearby, Behavior=5")
axs[1, 0].set_xlim([0, 6500])
p4 = tortoise['timeNearby'][tortoise['Behavior'] == 8].hist(bins=30, color="green", ax = axs[1,1])
axs[1, 1].set_xlabel("Minutes Spent within 20 Meters")
axs[1, 1].set_title("Histogram of Time Nearby, Behavior=8")
axs[1, 1].set_xlim([0, 6500])
plt.show()
We can generalize the time spent nearby by behavior as follows:
- Behavior 1 (main behavior): Low time nearby indicates just passing through many areas.
- Behavior 3 (transition behavior): Fairly high time nearby indicates areas of interest or rest. This behavior includes some time at a high-interest spot and some time at mid-interest spots.
- Behavior 5 (main behavior): Very high time nearby indicates areas of interest or rest.
- Behavior 8 (main behavior): High time nearby indicates areas of interest or rest.
Finally, we will graph behavior in relation to location (by color) to see where he is physically during each behavior.
fig, axs = matplotlib.pyplot.subplots(figsize = (5,10))
axs.axis('equal')
axs.set_title("Map of Tortoise Location over Time by Behavior")
axs.text(.4, 1.55, 'Each square in \nthe grid is 20x20 \nmeters.', fontsize=9, transform=ax.transAxes, bbox=dict(facecolor='white', alpha=0.5))
axs.grid(True)
plt.scatter(tortoise['X'][tortoise['Behavior']==1],tortoise['Y'][tortoise['Behavior']==1], c='red', s=1)
plt.scatter(tortoise['X'][tortoise['Behavior']==3],tortoise['Y'][tortoise['Behavior']==3], c='green', s=1)
plt.scatter(tortoise['X'][tortoise['Behavior']==5],tortoise['Y'][tortoise['Behavior']==5], c='blue', s=1)
plt.scatter(tortoise['X'][tortoise['Behavior']==8],tortoise['Y'][tortoise['Behavior']==8], c='orange', s=1)
axs.set_xticks(np.arange(582420, 582680, 20))
axs.set_xticklabels([])
axs.set_yticks(np.arange(-2195400, -2194860, 20))
axs.set_yticklabels([])
custom_lines = [Line2D([0], [0], color='red', lw=4),
Line2D([0], [0], color='green', lw=4),
Line2D([0], [0], color='blue', lw=4),
Line2D([0], [0], color='orange', lw=4)]
axs.legend(custom_lines, ['Behavior 1', 'Behavior 3', 'Behavior 5', 'Behavior 8'])
plt.show()
From this map, we can visually see that behavior 1 is mostly for moving from place to place, while behaviors 5 and 8 are primarily in one location. Also, the movement in behavior 5 is far more clustered than in behavior 8.
Based on my analysis of speed, time nearby, and spatial location that characterizes each behavior, I will give them the following labels:
Behavior 1: Traveling. The range of speeds, including very high, as well as the low time spent in any given area indicate that the tortoise is moving through areas from one place to another.
Behavior 5: Resting. The low movement, safe familiar place, and specific movement to a location to complete the behavior (as indicated in the map) indicates the tortoise is stopping and resting for a period of time.
Behavior 8: Foraging/Eating. The specific area that the tortoise stays in as well as the low speed motion with breaks to stop indicate that the tortoise is moving around and looking for something, probably vegetation to eat.
Behavior 3: Transition. Recall from the behavior-over-time plot that this behavior only occurred in the gaps between other behaviors; that, along with the variable speeds and time nearby, indicates that the tortoise is preparing to switch behaviors.
Finally, we will edit the dataset to include these descriptive labels.
def reclassify(x):
if x == 1:
return "Traveling"
elif x == 3:
return "Transition"
elif x == 5:
return "Resting"
elif x == 8:
return "Foraging"
# Reclassify to add words to each behavior
tortoise = tortoise.assign(**{'Behavior':tortoise['Behavior'].apply(reclassify)})
Conclusions¶
Based on my analysis, the Aldabra giant tortoise in the wild has distinct behaviors, like foraging, travelling, and resting, that he cycles through over the span of days and weeks rather than cycling through behaviors daily like a person. This insight is useful to zookeepers or veterinarians who have to support the Aldabra giant tortoise in captivity. For more thorough analysis, investigating a longer data span or multiple animals would yield wider trends that could be generalized to the entire species. Overall, the analysis of animal movement data yields valuable insights into animal behavior that are useful in both research and practice.