Network Latency Analysis

Python Simulation and Visualization in Google Colab

I’ll solve a network communication example using $Python$ in $Google$ $Colab$.

I’ll show the source code, explain it, and create visualization graphs to illustrate the results.

Let’s explore a network latency analysis example where we’ll simulate ping times to different servers, analyze the data, and visualize the results.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from datetime import datetime
import random
from IPython.display import display

# Global plot styling: seaborn whitegrid theme plus a larger base font
sns.set(style="whitegrid")
plt.rcParams['font.size'] = 12

# Function to simulate ping to different servers
def simulate_ping(server, num_pings=10):
    """Simulate ping times to a server with realistic patterns.

    Parameters
    ----------
    server : str
        One of 'Local', 'Regional', 'International', 'Cloud-A', 'Cloud-B'.
    num_pings : int, optional
        Number of pings to simulate (default 10).

    Returns
    -------
    numpy.ndarray
        Ping times in milliseconds; lost packets are NaN.

    Raises
    ------
    KeyError
        If *server* is not a known server name.
    """
    # Base latency (ms) depends on server location
    base_latency = {
        'Local': 5,
        'Regional': 25,
        'International': 120,
        'Cloud-A': 15,
        'Cloud-B': 45
    }
    if server not in base_latency:
        # Same exception type as the plain dict lookup, but with a helpful message.
        raise KeyError(f"Unknown server {server!r}; expected one of {sorted(base_latency)}")

    # Simulate variation in ping times
    base = base_latency[server]
    jitter = np.random.normal(0, base * 0.1, num_pings)  # 10% jitter
    packet_loss = np.random.choice([1, 0], num_pings, p=[0.05, 0.95])  # 5% packet loss

    # Add occasional network congestion spikes.
    # BUG FIX: the original called random.randint(0, num_pings - 3), which raises
    # ValueError for num_pings < 3 whenever congestion triggers. The random.random()
    # call is kept first so the RNG sequence is unchanged for num_pings >= 3.
    congestion = np.zeros(num_pings)
    if random.random() < 0.3 and num_pings >= 3:  # 30% chance of congestion during the test
        congestion_start = random.randint(0, num_pings - 3)
        congestion_duration = random.randint(1, 3)
        congestion[congestion_start:congestion_start + congestion_duration] = base * 0.5

    # Calculate ping times
    ping_times = base + jitter + congestion

    # Apply packet loss (set to NaN)
    ping_times[packet_loss == 1] = np.nan

    return ping_times

# Simulate data collection
def collect_network_data(servers, samples_per_server=60):
    """Collect simulated network data for multiple servers.

    NOTE: pings are issued in fixed batches of 10, so samples_per_server
    is effectively rounded down to the nearest multiple of 10.

    Returns a long-format DataFrame with one row per ping:
    timestamp, server, ping_ms (NaN = lost packet), packet_number.
    """
    records = []

    print("Collecting network data...")
    for server in servers:
        print(f"Pinging {server}...")
        for batch_idx in range(samples_per_server // 10):
            batch = simulate_ping(server, 10)   # 10 pings per batch
            stamp = datetime.now()              # one timestamp shared by the batch

            records.extend(
                {
                    'timestamp': stamp,
                    'server': server,
                    'ping_ms': value,
                    'packet_number': batch_idx * 10 + offset + 1,
                }
                for offset, value in enumerate(batch)
            )

            # Simulate data collection delay
            time.sleep(0.1)

    return pd.DataFrame(records)

# Analyze network data
def analyze_network_data(df):
    """Summarize latency, packet loss, jitter, and a health label per server.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format samples with 'server' and 'ping_ms' columns; a lost
        packet is recorded as NaN in 'ping_ms'.

    Returns
    -------
    pandas.DataFrame
        One row per server: avg/min/max/std ping, packet_loss (%),
        jitter, and a 'status' label.
    """
    per_server = df.groupby('server')['ping_ms']

    # NaN entries are skipped by mean/min/max/std, so lost packets only
    # contribute to the packet-loss percentage.
    summary = per_server.agg([
        ('avg_ping', 'mean'),
        ('min_ping', 'min'),
        ('max_ping', 'max'),
        ('std_ping', 'std'),
        ('packet_loss', lambda s: s.isna().mean() * 100),
    ]).round(2)

    # Jitter = mean absolute change between consecutive samples of a server
    step_change = per_server.diff().abs()
    summary['jitter'] = step_change.groupby(df['server']).mean().round(2)

    # Health label; later rules deliberately override earlier ones, so a
    # high-latency server shows 'High Latency' even if its loss is critical.
    summary['status'] = 'Good'
    summary.loc[summary['packet_loss'] > 1, 'status'] = 'Warning'
    summary.loc[summary['packet_loss'] > 5, 'status'] = 'Critical'
    summary.loc[summary['avg_ping'] > 100, 'status'] = 'High Latency'

    return summary

# Visualize network data
def visualize_network_data(df, summary):
    """Build a 2x2 dashboard of network metrics and return the figure.

    Panels: ping time series, per-server box plot, packet-loss bars, and
    average ping with standard-deviation error bars.
    """
    fig = plt.figure(figsize=(20, 16))

    # Panel 1: ping time series, one line per server
    ts_ax = plt.subplot(2, 2, 1)
    for name in df['server'].unique():
        subset = df[df['server'] == name]
        ts_ax.plot(subset['packet_number'], subset['ping_ms'], 'o-', label=name, alpha=0.7)
    ts_ax.set_title('Ping Times Over Time')
    ts_ax.set_xlabel('Packet Number')
    ts_ax.set_ylabel('Ping (ms)')
    ts_ax.legend()
    ts_ax.grid(True)

    # Panel 2: distribution of ping times per server
    box_ax = plt.subplot(2, 2, 2)
    sns.boxplot(x='server', y='ping_ms', data=df, ax=box_ax)
    box_ax.set_title('Distribution of Ping Times')
    box_ax.set_xlabel('Server')
    box_ax.set_ylabel('Ping (ms)')

    # Panel 3: packet-loss percentage per server
    loss_ax = plt.subplot(2, 2, 3)
    sns.barplot(x=summary.index, y='packet_loss', data=summary, ax=loss_ax)
    loss_ax.set_title('Packet Loss by Server')
    loss_ax.set_xlabel('Server')
    loss_ax.set_ylabel('Packet Loss (%)')

    # Panel 4: mean latency with std-dev error bars overlaid on the bars
    avg_ax = plt.subplot(2, 2, 4)
    sns.barplot(x=summary.index, y='avg_ping', data=summary, ax=avg_ax)
    for pos, name in enumerate(summary.index):
        avg_ax.errorbar(pos, summary.loc[name, 'avg_ping'],
                        yerr=summary.loc[name, 'std_ping'],
                        fmt='none', color='black', capsize=5)
    avg_ax.set_title('Average Ping Time by Server')
    avg_ax.set_xlabel('Server')
    avg_ax.set_ylabel('Average Ping (ms)')

    plt.tight_layout()
    return fig

# Run the network analysis
def run_network_analysis():
    """Run the complete workflow: collect, summarize, and visualize.

    Returns the raw samples DataFrame and the per-server summary.
    """
    # Servers under test
    servers = ['Local', 'Regional', 'International', 'Cloud-A', 'Cloud-B']

    df = collect_network_data(servers, samples_per_server=60)

    print("\nRaw data sample:")
    display(df.head())

    summary = analyze_network_data(df)

    print("\nNetwork Analysis Summary:")
    display(summary)

    fig = visualize_network_data(df, summary)
    plt.show()

    # Extra view: how correlated are the servers' ping patterns?
    plt.figure(figsize=(10, 8))
    wide = df.pivot_table(
        index='packet_number',
        columns='server',
        values='ping_ms'
    )
    sns.heatmap(wide.corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
    plt.title('Correlation of Ping Times Between Servers')
    plt.tight_layout()
    plt.show()

    return df, summary

# Run the analysis
# Binds the collected samples and the per-server summary for later inspection.
network_data, network_summary = run_network_analysis()

Network Latency Analysis Explanation

The code above simulates and analyzes network communication by measuring ping times to different servers.

Here’s a breakdown of what the code does:

1. Data Simulation

  • The simulate_ping() function generates realistic ping times for different server types:

    • Local servers (close by, low latency)
    • Regional servers (medium distance)
    • International servers (long distance, high latency)
    • Two cloud providers with different characteristics
  • The simulation includes realistic network behaviors:

    • Base latency depending on geographic distance
    • Random jitter (small variations in ping time)
    • Occasional packet loss ($5$% chance)
    • Random network congestion events

2. Data Collection

  • The collect_network_data() function gathers ping data from each server type
  • For each server, it collects multiple ping samples ($60$ by default)
  • Each ping measurement is timestamped to track time-based patterns

3. Data Analysis

  • The analyze_network_data() function calculates important network metrics:
    • Average ping time (latency)
    • Minimum and maximum ping times
    • Standard deviation (consistency)
    • Packet loss percentage
    • Jitter (variation between consecutive pings)
    • Network status classification based on metrics

4. Visualization

The code creates four main visualizations to understand the network behavior:

  1. Time Series Plot: Shows how ping times change over time for each server
  2. Box Plot: Displays the distribution of ping times, highlighting outliers
  3. Packet Loss Chart: Compares packet loss percentages across servers
  4. Average Ping Chart: Shows average latency with standard deviation error bars

Additionally, it creates a correlation heatmap to see relationships between servers’ performance.

How to Use in Google Colab

To run this in Google Colab:

  1. Create a new notebook
  2. Copy and paste the code into a cell
  3. Run the cell to execute the entire analysis
  4. The code will display progress messages, raw data samples, and create visualizations

Key Insights from the Visualizations

The visualizations help identify:

  • Which servers have the lowest latency
  • Which servers experience packet loss
  • How consistent each connection is (jitter and standard deviation)
  • Whether server performance is correlated (suggesting shared network paths)
  • Time-based patterns like periodic congestion

This analysis would be valuable for network administrators, application developers planning for geographic distribution, or anyone troubleshooting network performance issues.

Output



Analysis of Projectile Motion

Trajectories at Different Launch Angles with Python Simulation

I’ll create a physics problem about projectile motion and solve it using $Python$, including visualization.

This is a common physics problem that combines kinematics equations with practical applications.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import numpy as np
import matplotlib.pyplot as plt

def calculate_trajectory(v0, angle_deg, g=9.81):
    """
    Calculate the trajectory of an ideal (drag-free) projectile.

    Parameters:
    v0 (float): Initial velocity in m/s
    angle_deg (float): Launch angle in degrees
    g (float): Acceleration due to gravity in m/s²

    Returns:
    tuple: (t, x, y, h_max, range_x) — time samples, horizontal and
    vertical positions, peak height, and total horizontal range.
    """
    # Work in radians for the trig functions
    theta = np.radians(angle_deg)

    # Velocity components at launch
    v0x = v0 * np.cos(theta)
    v0y = v0 * np.sin(theta)

    # Flight lasts until y returns to zero: t = 2*v0y/g
    t_flight = 2 * v0y / g

    # 100 evenly spaced samples over the flight
    t = np.linspace(0, t_flight, 100)

    # Kinematics: constant horizontal velocity, uniform vertical acceleration
    x = v0x * t
    y = v0y * t - 0.5 * g * t**2

    # Closed-form peak height and range
    h_max = v0y**2 / (2 * g)
    range_x = v0x * t_flight

    return t, x, y, h_max, range_x

# Set initial conditions
v0 = 20  # Initial velocity (m/s)
angles = [30, 45, 60]  # Launch angles (degrees)

# Create the figure
plt.figure(figsize=(12, 6))

# Plot one trajectory per launch angle and report its key numbers
for angle in angles:
    times, xs, ys, peak_height, total_range = calculate_trajectory(v0, angle)
    plt.plot(xs, ys, label=f'Angle = {angle}°')

    print(f"\nFor {angle}° launch angle:")
    print(f"Maximum height: {peak_height:.2f} meters")
    print(f"Range: {total_range:.2f} meters")

plt.title('Projectile Motion for Different Launch Angles')
plt.xlabel('Distance (m)')
plt.ylabel('Height (m)')
plt.grid(True)
plt.legend()
plt.axis('equal')  # equal scaling so the parabola shapes are not distorted
plt.show()

Let me explain this physics problem and solution:

  1. Physical Problem:

    • We’re analyzing projectile motion with an initial velocity of $20$ $m/s$ at different launch angles ($30$°, $45$°, and $60$°)
    • We’re considering ideal conditions (no air resistance)
    • Gravity is set to $9.81$ m/s²
  2. Code Breakdown:

    • The calculate_trajectory function uses these physics equations:
      • x-position: $x = v_{0x} t$
      • y-position: $y = v_{0y} t - \tfrac{1}{2} g t^2$
      • Maximum height: $h_{\text{max}} = v_{0y}^2 / (2g)$
      • Time of flight: $t_{\text{flight}} = 2 v_{0y} / g$
  3. Visualization:

    • The code creates a plot showing three different trajectories
    • Each trajectory represents a different launch angle
    • The x-axis shows horizontal distance
    • The y-axis shows height
  4. Key Physics Concepts:

    • At $45$°, you get the maximum range
    • Higher angles ($60$°) give greater maximum height but shorter range
    • Lower angles ($30$°) give lower maximum height but still good range

To run this in Google Colab, you’ll see:

  • A graph showing three parabolic trajectories
  • Printed calculations of maximum height and range for each angle
  • The axes are set to equal scale for proper visualization

Output

For 30° launch angle:
Maximum height: 5.10 meters
Range: 35.31 meters

For 45° launch angle:
Maximum height: 10.19 meters
Range: 40.77 meters

For 60° launch angle:
Maximum height: 15.29 meters
Range: 35.31 meters

Estimating the Habitable Zone of a Star

Problem

The habitable zone (HZ) of a star is the region around it where conditions might allow liquid water to exist on a planet’s surface.

The boundaries of the HZ can be approximated using the following formula:

$$
d_{\text{inner}} = \sqrt{\frac{L}{1.1}}
$$

$$
d_{\text{outer}} = \sqrt{\frac{L}{0.53}}
$$

  • $ L $ is the star’s luminosity relative to the Sun’s luminosity ($ L_{\odot} $)
  • $ d_{\text{inner}} $ is the inner boundary of the habitable zone (in AU)
  • $ d_{\text{outer}} $ is the outer boundary of the habitable zone (in AU)

We will calculate and visualize the habitable zone for stars with different luminosities.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import numpy as np
import matplotlib.pyplot as plt

# Stellar luminosities from 0.1x to 10x the Sun's
luminosities = np.linspace(0.1, 10, 100)

# Habitable-zone boundaries (AU): d = sqrt(L / flux_limit)
inner_hz = np.sqrt(luminosities / 1.1)   # inner edge
outer_hz = np.sqrt(luminosities / 0.53)  # outer edge

# Draw both boundary curves and shade the band between them
plt.figure(figsize=(8, 5))
plt.plot(luminosities, inner_hz, label='Inner Boundary', color='red')
plt.plot(luminosities, outer_hz, label='Outer Boundary', color='blue')
plt.fill_between(luminosities, inner_hz, outer_hz, color='green', alpha=0.3, label='Habitable Zone')

# Labels and title
plt.xlabel("Luminosity (L/L_sun)")
plt.ylabel("Distance (AU)")
plt.title("Habitable Zone Boundaries for Different Star Luminosities")
plt.legend()
plt.grid()
plt.show()

Explanation

  1. We define a range of star luminosities from 0.1 to 10 times the Sun’s luminosity.
  2. Using the given formulas, we calculate the inner and outer boundaries of the habitable zone.
  3. We plot:
    • The inner boundary in red.
    • The outer boundary in blue.
    • The habitable zone as a green shaded area.
  4. The $x$-axis represents stellar luminosity, and the $y$-axis represents the distance in AU (astronomical units).

Graph Interpretation

  • Stars with higher luminosity have their habitable zones farther from the star.
  • Smaller stars have their habitable zones closer.
  • The habitable zone widens as luminosity increases.

This visualization helps astronomers estimate where potentially habitable exoplanets might exist around different types of stars. 🚀🔭

Implementation of Linear Regression

I’ll create a machine learning example using $Python$ to demonstrate linear regression on a simple dataset, then visualize the results.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Synthetic data from the known relationship y = 4 + 3x + Gaussian noise
np.random.seed(42)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)

# Hold out 20% of the samples for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit an ordinary least-squares model on the training split
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the held-out data
y_pred = model.predict(X_test)

# Learned parameters (true values are intercept 4, slope 3)
print(f"Intercept: {model.intercept_[0]:.2f}")
print(f"Slope: {model.coef_[0][0]:.2f}")

# Goodness of fit on the test split
r2_score = model.score(X_test, y_test)
print(f"R-squared score: {r2_score:.2f}")

# Visualization
plt.figure(figsize=(10, 6))
plt.scatter(X_train, y_train, color='blue', label='Training Data')
plt.scatter(X_test, y_test, color='green', label='Testing Data')
plt.plot(X_test, y_pred, color='red', linewidth=2, label='Regression Line')
plt.xlabel('X')
plt.ylabel('y')
plt.title('Linear Regression Example')
plt.legend()
plt.grid(True)
plt.show()

Let me explain this example step by step:

  1. Data Generation:

    • We create synthetic data with a known relationship: $y = 4 + 3x + noise$
    • $100$ data points are generated with random noise added
    • The true relationship has an intercept of $4$ and a slope of $3$
  2. Data Splitting:

    • The data is split into training ($80$%) and testing ($20$%) sets
    • This helps us evaluate how well our model generalizes to new data
  3. Model Training:

    • We use scikit-learn’s LinearRegression model
    • The model learns the relationship between $X$ and $y$ from the training data
  4. Results Analysis:

    • The model finds the intercept and slope that best fit the data
    • R-squared score shows how well the model fits the test data
    • A score close to $1.0$ indicates a good fit
  5. Visualization:

    • Blue dots: Training data points
    • Green dots: Testing data points
    • Red line: The predicted relationship found by the model
    • The grid helps see how well the line fits the data

When you run this code, you’ll see:

  • The model parameters (intercept and slope) should be close to the true values ($4$ and $3$)
  • The R-squared score should be relatively high (> $0.8$)
  • The visualization will show how well the linear regression line fits the data points
Intercept: 4.14
Slope: 2.80
R-squared score: 0.81

This is a basic example of supervised learning where we:

  1. Generate data with a known pattern
  2. Train a model to discover that pattern
  3. Evaluate how well the model learned
  4. Visualize the results

Marketing Analytics: RFM Customer Segmentation with Python Visualization

I’ll create a marketing analytics example using $Python$ to analyze customer segmentation based on purchase behavior and create visualizations.

We’ll use the $RFM$ (Recency, Frequency, Monetary) analysis, which is a common marketing technique.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns

# Generate sample customer transaction data (seeded for reproducibility)
np.random.seed(42)

# One row per customer
n_customers = 1000
customer_ids = range(1, n_customers + 1)

# Random "last purchase" dates within the past year
end_date = datetime(2025, 2, 21)
start_date = end_date - timedelta(days=365)
dates = [start_date + timedelta(days=np.random.randint(0, 365)) for _ in range(n_customers)]

# Purchase counts (Poisson) and spend amounts (Normal, floored at $20)
frequencies = np.random.poisson(lam=3, size=n_customers)
monetary_values = np.random.normal(100, 30, n_customers)
monetary_values = [max(20, val) for val in monetary_values]

# Assemble the customer table
data = pd.DataFrame({
    'customer_id': customer_ids,
    'last_purchase_date': dates,
    'purchase_frequency': frequencies,
    'total_monetary_value': monetary_values
})

# Recency = days since the customer's last purchase, relative to end_date
current_date = end_date
data['recency'] = (current_date - pd.to_datetime(data['last_purchase_date'])).dt.days

# Function to assign RFM scores
def assign_score(value, quartiles):
    """Score a value 4..1 by quartile: values at or below Q1 score 4 (best),
    values above Q3 score 1 (worst). Lower raw values rank higher."""
    for score, cutoff in ((4, quartiles[0.25]), (3, quartiles[0.5]), (2, quartiles[0.75])):
        if value <= cutoff:
            return score
    return 1

# Quartile cut-points for each RFM dimension
r_quartiles = data['recency'].quantile([0.25, 0.5, 0.75])
f_quartiles = data['purchase_frequency'].quantile([0.25, 0.5, 0.75])
m_quartiles = data['total_monetary_value'].quantile([0.25, 0.5, 0.75])

# Recency: smaller is better, so assign_score is used directly.
# Frequency/monetary: larger is better, so the 4..1 scale is inverted via (5 - score).
data['R_score'] = data['recency'].apply(lambda v: assign_score(v, r_quartiles))
data['F_score'] = data['purchase_frequency'].apply(lambda v: 5 - assign_score(v, f_quartiles))
data['M_score'] = data['total_monetary_value'].apply(lambda v: 5 - assign_score(v, m_quartiles))

# Combine into a 3-digit RFM code (R = hundreds, F = tens, M = units)
data['RFM_score'] = data['R_score'] * 100 + data['F_score'] * 10 + data['M_score']

# Segment customers based on RFM score
def segment_customers(row):
    """Map a combined 3-digit RFM code to a named customer segment."""
    score = row['RFM_score']
    if score >= 444:
        return 'Best Customers'
    if score >= 334:
        return 'Loyal Customers'
    if score >= 224:
        return 'Average Customers'
    return 'Lost Customers'

# Label each customer with a segment
data['customer_segment'] = data.apply(segment_customers, axis=1)

# Create visualizations
plt.figure(figsize=(15, 10))

# Plot 1: share of customers in each segment
plt.subplot(2, 2, 1)
segment_counts = data['customer_segment'].value_counts()
plt.pie(segment_counts, labels=segment_counts.index, autopct='%1.1f%%')
plt.title('Customer Segments Distribution')

# Plot 2: mean spend per segment
plt.subplot(2, 2, 2)
avg_monetary = data.groupby('customer_segment')['total_monetary_value'].mean()
avg_monetary.plot(kind='bar')
plt.title('Average Monetary Value by Segment')
plt.ylabel('Average Spend ($)')
plt.xticks(rotation=45)

# Plot 3: recency vs frequency, colored by spend
plt.subplot(2, 2, 3)
plt.scatter(data['recency'], data['purchase_frequency'],
            c=data['total_monetary_value'], cmap='viridis')
plt.colorbar(label='Monetary Value ($)')
plt.xlabel('Recency (days)')
plt.ylabel('Purchase Frequency')
plt.title('Recency vs Frequency (color = Monetary Value)')

# Plot 4: histogram of combined RFM codes
plt.subplot(2, 2, 4)
plt.hist(data['RFM_score'], bins=20)
plt.xlabel('RFM Score')
plt.ylabel('Number of Customers')
plt.title('RFM Score Distribution')

plt.tight_layout()
plt.show()

# Print summary statistics
print("\nCustomer Segment Summary:")
print(data.groupby('customer_segment').agg({
    'customer_id': 'count',
    'total_monetary_value': 'mean',
    'purchase_frequency': 'mean',
    'recency': 'mean'
}).round(2))

Let me explain the code and analysis:

  1. Data Generation:

    • Created sample data for $1000$ customers
    • Generated random purchase dates within the last year
    • Simulated purchase frequencies using Poisson distribution
    • Generated monetary values using normal distribution
  2. RFM Analysis:

    • Recency: Days since last purchase
    • Frequency: Number of purchases
    • Monetary: Total amount spent
    • Scored each component from $1$-$4$ based on quartiles
    • Combined scores to create RFM_score
  3. Customer Segmentation:

    • Best Customers: Highest RFM scores
    • Loyal Customers: Good RFM scores
    • Average Customers: Medium RFM scores
    • Lost Customers: Low RFM scores
  4. Visualizations:

    • Pie chart showing distribution of customer segments
    • Bar chart showing average spend by segment
    • Scatter plot of recency vs frequency
    • Histogram of RFM scores

Output

Customer Segment Summary:
                   customer_id  total_monetary_value  purchase_frequency  \
customer_segment                                                           
Average Customers          310                101.87                3.07   
Best Customers              12                143.16                5.58   
Lost Customers             387                 97.72                2.52   
Loyal Customers            291                 97.65                3.38   

                   recency  
customer_segment            
Average Customers   172.13  
Best Customers       40.75  
Lost Customers      286.34  
Loyal Customers      64.54  

The analysis provides several key insights:

  1. Customer segment distribution shows the proportion of customers in each category
  2. Average monetary value by segment helps identify high-value customer groups
  3. Recency vs Frequency plot reveals customer purchase patterns
  4. RFM Score distribution shows the overall spread of customer values

Student Performance Analysis

Correlation Between Study Hours and Test Scores

I’ll create a data science example using $Python$ that demonstrates data analysis and visualization.

Let’s analyze a dataset of student performance based on study hours and create visualizations to understand the relationship.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Generate sample data (seeded for reproducibility)
np.random.seed(42)
study_hours = np.random.normal(7, 2, 50)  # 50 students, mean 7 h, sd 2 h
base_score = study_hours * 8              # underlying linear relationship
noise = np.random.normal(0, 5, 50)        # random variation around the trend
test_scores = base_score + noise

# Assemble the student table
data = pd.DataFrame({
    'study_hours': study_hours,
    'test_scores': test_scores
})

# Clamp scores into the valid 0-100 range
data['test_scores'] = data['test_scores'].clip(0, 100)

# Basic descriptive statistics
mean_study = data['study_hours'].mean()
mean_score = data['test_scores'].mean()
correlation = data['study_hours'].corr(data['test_scores'])

# Fit a least-squares line predicting test scores from study hours
X = data['study_hours'].values.reshape(-1, 1)
y = data['test_scores'].values
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)

# Model quality: R² and root-mean-square error on the same data
r2 = r2_score(y, y_pred)
rmse = np.sqrt(mean_squared_error(y, y_pred))

# Create visualization
plt.figure(figsize=(12, 6))

# Left: scatter with fitted regression line
# NOTE(review): the scatter uses the raw (unclipped) test_scores array while
# the regression was fit on the clipped column — confirm this is intended.
plt.subplot(1, 2, 1)
plt.scatter(study_hours, test_scores, alpha=0.5)
plt.plot(X, y_pred, color='red', linewidth=2)
plt.xlabel('Study Hours')
plt.ylabel('Test Scores')
plt.title('Study Hours vs Test Scores')

# Right: histogram of study hours
plt.subplot(1, 2, 2)
plt.hist(study_hours, bins=15, edgecolor='black')
plt.xlabel('Study Hours')
plt.ylabel('Frequency')
plt.title('Distribution of Study Hours')

plt.tight_layout()
plt.show()

# Print analysis results
print("\nAnalysis Results:")
print(f"Average study hours: {mean_study:.2f}")
print(f"Average test score: {mean_score:.2f}")
print(f"Correlation coefficient: {correlation:.2f}")
print(f"R-squared value: {r2:.2f}")
print(f"Root Mean Square Error: {rmse:.2f}")
print(f"For each additional hour of study, score increases by: {model.coef_[0]:.2f} points")

Let me explain this data science example step by step:

  1. Data Generation and Setup:

    • We create synthetic data for $50$ students
    • Each student has study hours (normally distributed around $7$ hours)
    • Test scores are generated based on study hours with some random noise
    • We ensure scores stay within $0$-$100$ range
  2. Analysis Performed:

    • Basic statistics (mean study hours and test scores)
    • Correlation between study hours and test scores
    • Linear regression to predict test scores from study hours
    • Model performance metrics ($R$-$squared$ and $RMSE$)
  3. Visualizations:

    • Left plot: Scatter plot showing relationship between study hours and test scores
      • Blue dots represent individual students
      • Red line shows the linear regression prediction
    • Right plot: Histogram showing distribution of study hours
      • Shows how many students study for different amounts of time
  4. Key Findings (will vary with random data):

    • There’s a positive correlation between study hours and test scores
    • The linear regression line shows the general trend
    • The $R$-$squared$ value indicates how well study hours predict test scores
    • The coefficient shows how many points scores increase per additional study hour

This example demonstrates several key data science concepts:

  • Data cleaning and preparation
  • Statistical analysis
  • Linear regression modeling
  • Data visualization
  • Model evaluation

Output

Analysis Results:
Average study hours: 6.55
Average test score: 52.48
Correlation coefficient: 0.96
R-squared value: 0.93
Root Mean Square Error: 4.30
For each additional hour of study, score increases by: 8.26 points

Environmental Data Analysis: CO2 Emissions Impact on Global Temperature (1970-2020)

I’ll create an environmental science example analyzing the relationship between CO2 emissions and global temperature changes, then visualize the data.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# Generate sample data (simplified model)
years = np.arange(1970, 2021)
# CO2 emissions: linear upward trend plus noise
# NOTE(review): no RNG seed is set, so each run produces different numbers.
co2_emissions = 15000 + (years - 1970) * 300 + np.random.normal(0, 500, len(years))
# Temperature anomalies tied linearly to emissions, plus noise
base_temp = 14.0  # base temperature in Celsius (NOTE(review): unused below)
temp_anomalies = (co2_emissions - co2_emissions[0]) * 0.0001 + np.random.normal(0, 0.1, len(years))

# Assemble the time series
climate_data = pd.DataFrame({
    'Year': years,
    'CO2_Emissions': co2_emissions,
    'Temperature_Anomaly': temp_anomalies
})

# Pearson correlation between emissions and temperature anomaly
correlation = stats.pearsonr(climate_data['CO2_Emissions'], climate_data['Temperature_Anomaly'])

# Least-squares fit of anomaly against emissions
slope, intercept = np.polyfit(climate_data['CO2_Emissions'], climate_data['Temperature_Anomaly'], 1)
regression_line = slope * climate_data['CO2_Emissions'] + intercept

# Create visualization
plt.figure(figsize=(12, 8))

# Top panel: emissions over time
plt.subplot(2, 1, 1)
plt.plot(climate_data['Year'], climate_data['CO2_Emissions'], 'b-', label='CO2 Emissions')
plt.title('Annual CO2 Emissions (1970-2020)')
plt.xlabel('Year')
plt.ylabel('CO2 Emissions (Million Metric Tons)')
plt.grid(True)
plt.legend()

# Bottom panel: anomaly vs emissions with the fitted line
plt.subplot(2, 1, 2)
plt.scatter(climate_data['CO2_Emissions'], climate_data['Temperature_Anomaly'],
            alpha=0.5, label='Temperature vs CO2')
plt.plot(climate_data['CO2_Emissions'], regression_line, 'r-',
         label=f'Regression Line (slope: {slope:.2e})')
plt.title('Temperature Anomaly vs CO2 Emissions')
plt.xlabel('CO2 Emissions (Million Metric Tons)')
plt.ylabel('Temperature Anomaly (°C)')
plt.grid(True)
plt.legend()

plt.tight_layout()
plt.show()

# Print analysis results
print(f"Correlation coefficient: {correlation[0]:.3f}")
print(f"P-value: {correlation[1]:.3e}")
print(f"Regression slope: {slope:.2e} °C/MtCO2")

Let me explain this environmental science example:

  1. Problem Statement:
    We’re analyzing the relationship between CO2 emissions and global temperature changes over a $50$-year period ($1970$-$2020$).

  2. Data Simulation:

  • We create synthetic data that mimics real-world patterns
  • CO2 emissions show an increasing trend with random variations
  • Temperature anomalies are modeled with a correlation to CO2 emissions
  3. Analysis Components:
  • Correlation analysis between CO2 and temperature
  • Linear regression to quantify the relationship
  • Visualization of trends and relationships
  4. Visualization:
    The code creates two plots:
  • Top plot: Shows CO2 emissions trend over time
  • Bottom plot: Displays the relationship between temperature anomalies and CO2 emissions, including a regression line
  5. Key Features:
  • Error handling with random variations
  • Statistical analysis (correlation coefficient and $p$-value)
  • Clear visualization with proper labeling and grid lines

When you run this code, you’ll see:

  • A clear upward trend in CO2 emissions over time
  • A positive correlation between CO2 emissions and temperature anomalies
  • Statistical measures of the relationship strength

Correlation coefficient: 0.981
P-value: 2.097e-36
Regression slope: 1.00e-04 °C/MtCO2

Sprint Performance Analysis

I’ll create a sports science example analyzing sprint performance data and demonstrate how to process and visualize it using $Python$.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# --- Build a synthetic 100 m sprint dataset: 10 athletes x 5 sessions ---
np.random.seed(42)  # fixed seed so every run reproduces the same numbers
n_athletes = 10
n_sessions = 5

# Row and column labels for the DataFrame.
athlete_names = [f"Athlete_{i+1}" for i in range(n_athletes)]
sessions = [f"Session_{i+1}" for i in range(n_sessions)]

# Each athlete starts from a personal baseline drawn in the realistic
# 10.5-11.5 s range; each later session is 0.05 s faster on average,
# with N(0, 0.1) noise on top. The per-draw loop order (athlete-major,
# session-minor) matches the generator sequence of the original code.
base_times = np.random.uniform(10.5, 11.5, n_athletes)
rows = []
for baseline in base_times:
    session_times = []
    for session_idx in range(n_sessions):
        noise = np.random.normal(0, 0.1)
        session_times.append(baseline - (session_idx * 0.05) + noise)
    rows.append(session_times)
sprint_times = np.array(rows)

# Athletes as rows, sessions as columns.
df = pd.DataFrame(sprint_times,
                  index=athlete_names,
                  columns=sessions)

# Per-session summary statistics.
session_means = df.mean()
session_stds = df.std()
improvement = df['Session_1'] - df['Session_5']  # positive = got faster

# Paired t-test: did times change significantly from first to last session?
ttest_result = stats.ttest_rel(df['Session_1'], df['Session_5'])

# --- Four-panel visual summary of the training data ---
plt.figure(figsize=(15, 10))

# Panel 1: one progression line per athlete across the five sessions.
plt.subplot(2, 2, 1)
for name, row in df.iterrows():
    plt.plot(sessions, row, marker='o', label=name)
plt.title('Sprint Times Progression')
plt.xlabel('Training Session')
plt.ylabel('Time (seconds)')
plt.grid(True)

# Panel 2: distribution of times within each session.
plt.subplot(2, 2, 2)
df.boxplot()
plt.title('Distribution of Sprint Times')
plt.xlabel('Training Session')
plt.ylabel('Time (seconds)')

# Panel 3: session means with +/- one standard deviation error bars.
plt.subplot(2, 2, 3)
plt.errorbar(sessions, session_means, yerr=session_stds,
             marker='o', capsize=5, capthick=2)
plt.title('Mean Sprint Times with Standard Deviation')
plt.xlabel('Training Session')
plt.ylabel('Time (seconds)')
plt.grid(True)

# Panel 4: first-to-last improvement per athlete, sorted ascending.
plt.subplot(2, 2, 4)
improvement.sort_values().plot(kind='bar')
plt.title('Total Improvement by Athlete')
plt.xlabel('Athlete')
plt.ylabel('Time Improvement (seconds)')
plt.grid(True)

plt.tight_layout()
plt.show()

# --- Console summary of the numbers behind the plots ---
print("\nStatistical Summary:")
print("===================")
print(f"Mean initial time: {df['Session_1'].mean():.2f} seconds")
print(f"Mean final time: {df['Session_5'].mean():.2f} seconds")
print(f"Average improvement: {improvement.mean():.2f} seconds")
print(f"\nPaired t-test results:")
print(f"t-statistic: {ttest_result.statistic:.3f}")
print(f"p-value: {ttest_result.pvalue:.3f}")

Let me explain this sports science example that analyzes sprint performance:

  1. Data Generation and Setup:

    • We simulate data for $10$ athletes over $5$ training sessions
    • Sprint times are generated between $10.5$ and $11.5$ seconds (realistic $100$m sprint times)
    • We include a slight improvement trend and random variation
  2. Analysis Components:

    • Tracking individual athlete progression
    • Calculating statistical measures (mean, standard deviation)
    • Measuring total improvement
    • Performing a paired t-test to check if improvement is statistically significant
  3. Visualizations:

The code creates four different plots:

  • Line plot showing each athlete’s progression
  • Box plot showing the distribution of times in each session
  • Mean times with error bars showing variation
  • Bar chart showing total improvement by athlete
  4. Statistical Output:
Statistical Summary:
===================
Mean initial time: 10.95 seconds
Mean final time: 10.81 seconds
Average improvement: 0.14 seconds

Paired t-test results:
t-statistic: 3.765
p-value: 0.004
  • Calculates mean initial and final times
  • Shows average improvement across all athletes
  • Performs statistical significance testing

The visualizations help coaches and athletes:

  • Track individual and group progress
  • Identify outliers or unusual patterns
  • Understand the variation in performance
  • Quantify improvements over time

This analysis could be extended to include:

  • More advanced metrics like acceleration phases
  • Fatigue analysis
  • Correlation with other training parameters
  • Prediction of future performance

Modeling Tumor Growth Using the Gompertz Model

Let’s consider a medical example where we analyze the growth of a tumor over time.

We’ll use $Python$ to model the tumor growth, solve the differential equation, and visualize the results.

Problem Statement

The growth of a tumor can often be modeled using the Gompertz growth model, which is given by the differential equation:

$$
\frac{dV}{dt} = a V \ln\left(\frac{K}{V}\right)
$$

  • $ V(t) $ is the volume of the tumor at time $ t $.
  • $ a $ is the growth rate of the tumor.
  • $ K $ is the carrying capacity, i.e., the maximum volume the tumor can reach.

Python Implementation

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import numpy as np
from scipy.integrate import solve_ivp
import matplotlib.pyplot as plt

# Define the Gompertz growth model
def gompertz_growth(t, V, a, K):
    """Right-hand side of the Gompertz ODE: dV/dt = a * V * ln(K / V).

    t is unused (the equation is autonomous) but is required by the
    (t, y) call signature that scipy's solve_ivp expects.
    V is the current tumor volume, a the growth rate, K the carrying capacity.
    """
    remaining_capacity = np.log(K / V)  # ln(K/V) -> 0 as V approaches K
    return a * V * remaining_capacity

# Model parameters.
a = 0.1    # intrinsic growth rate
K = 1000   # carrying capacity (maximum tumor volume)
V0 = 10    # tumor volume at t = 0

# Integrate over t in [0, 50], sampling 300 evenly spaced output points.
t_span = (0, 50)
t_eval = np.linspace(*t_span, 300)

# Solve numerically with an explicit Runge-Kutta 5(4) scheme (non-stiff ODE).
sol = solve_ivp(gompertz_growth, t_span, [V0], args=(a, K), t_eval=t_eval, method='RK45')

# The first (and only) state component is the tumor volume.
V = sol.y[0]

# Volume-versus-time curve; it rises steeply at first, then saturates toward K.
plt.figure(figsize=(10, 6))
plt.plot(sol.t, V, label='Tumor Volume (V)', color='b')
plt.xlabel('Time (t)')
plt.ylabel('Tumor Volume (V)')
plt.title('Tumor Growth Over Time (Gompertz Model)')
plt.legend()
plt.grid(True)
plt.show()

Explanation

  1. Gompertz Growth Model:

The differential equation $\frac{dV}{dt} = a V \ln\left(\frac{K}{V}\right)$ is implemented in the gompertz_growth function.
This function takes the current time t, the current tumor volume V, and the parameters a and K to compute the rate of change of the tumor volume.

  2. Parameters:

    • a = 0.1: This is the growth rate of the tumor.
    • K = 1000: This is the carrying capacity, the maximum volume the tumor can reach.
    • V0 = 10: This is the initial volume of the tumor at time t=0.
  3. Solving the Differential Equation:

We use solve_ivp from scipy.integrate to solve the differential equation.
The method RK45 is a Runge-Kutta method of order $5(4)$, which is suitable for solving non-stiff differential equations.

  4. Plotting the Results:

The tumor volume V is plotted against time t.
The plot shows how the tumor volume grows over time, approaching the carrying capacity K.

Graph Interpretation

  • The tumor volume starts at V0 = 10 and grows rapidly at first.
  • As the tumor volume approaches the carrying capacity K = 1000, the growth rate slows down.
  • The tumor volume asymptotically approaches the carrying capacity, but never exceeds it.

This model is useful in oncology for understanding tumor growth dynamics and planning treatment strategies.

Output

The output will be a graph showing the tumor volume V over time t.

The curve will start at V0 = 10 and gradually approach K = 1000, illustrating the Gompertz growth behavior.

Population-Resource Dynamics

Let’s solve an environmental science problem using $Python$.

We will model Population Dynamics and Resource Consumption, a crucial topic in environmental science to understand sustainability and ecological balance.


Problem Statement

We want to simulate the interactions between a population and a renewable resource:

  • A population consumes a natural resource for survival.
  • The resource regenerates over time but is depleted by consumption.
  • If consumption exceeds regeneration, the resource may collapse, leading to population decline.
  • Conversely, if the resource is abundant, the population may grow.

We will model this using the Lotka-Volterra equations, which are typically used for predator-prey dynamics but can also describe resource-consumer interactions.


Methodology

  1. Population and Resource Dynamics:
    • Population growth is proportional to available resources.
    • Resource regeneration follows logistic growth.
  2. Differential Equations:
    • $ \frac{dR}{dt} = r \cdot R \left(1 - \frac{R}{K}\right) - c \cdot P \cdot R $
    • $ \frac{dP}{dt} = e \cdot c \cdot P \cdot R - m \cdot P $
      • $ R $: Resource quantity
      • $ P $: Population size
      • $ r $: Resource growth rate
      • $ K $: Carrying capacity of the resource
      • $ c $: Consumption rate
      • $ e $: Efficiency of resource to population growth
      • $ m $: Mortality rate of the population
  3. Simulation:
    • Use numerical integration to simulate population-resource dynamics over time.
  4. Visualization:
    • Plot resource and population sizes over time.
    • Phase plot to observe cyclical behavior.

Required Libraries

We’ll use the following libraries:

  • numpy: For numerical calculations.
  • scipy: To solve differential equations.
  • matplotlib: To visualize the results.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import numpy as np
from scipy.integrate import solve_ivp
import matplotlib.pyplot as plt

# Model parameters (shared by the ODE right-hand side below).
r = 0.5   # Resource growth rate
K = 100   # Resource carrying capacity
c = 0.02  # Consumption rate
e = 0.1   # Efficiency of resource use
m = 0.1   # Mortality rate of population

def model(t, y):
    """Resource-consumer ODE right-hand side for solve_ivp.

    y = [R, P]: resource stock and population size.
    Returns [dR/dt, dP/dt]: logistic resource regeneration minus
    mass-action consumption, and consumption-fueled growth minus mortality.
    """
    R, P = y
    regeneration = r * R * (1 - R / K)   # logistic growth toward K
    consumed = c * P * R                 # resource removed by the population
    dRdt = regeneration - consumed
    dPdt = e * c * P * R - m * P         # fraction e of consumption becomes growth
    return [dRdt, dPdt]

# Starting state: abundant resource, small population.
R0 = 80   # Initial resource quantity
P0 = 10   # Initial population size
y0 = [R0, P0]

# Simulate 200 time units, sampled at 1000 output points.
t_span = (0, 200)
t_eval = np.linspace(t_span[0], t_span[1], 1000)

# Integrate the coupled ODE system.
solution = solve_ivp(model, t_span, y0, t_eval=t_eval)

# Unpack the time grid and the two state trajectories.
time = solution.t
R, P = solution.y

# Time-series view: both state variables on one axis.
plt.figure(figsize=(10, 5))
plt.plot(time, R, label='Resource', color='green')
plt.plot(time, P, label='Population', color='blue')
plt.title('Population and Resource Dynamics Over Time')
plt.xlabel('Time')
plt.ylabel('Quantity')
plt.legend()
plt.grid(True)
plt.show()

# Phase-space view: population against resource, revealing cycles.
plt.figure(figsize=(6, 6))
plt.plot(R, P, color='purple')
plt.title('Phase Plot: Resource vs Population')
plt.xlabel('Resource Quantity')
plt.ylabel('Population Size')
plt.grid(True)
plt.show()

Explanation

  1. Differential Equations:
    • The resource grows logistically and is consumed by the population.
    • Population growth is dependent on resource availability.
  2. Numerical Integration:
    We use scipy.integrate.solve_ivp() to numerically solve the differential equations over a specified time span.
  3. Visualization:
    • Time Series Plot: Shows resource and population dynamics over time.
    • Phase Plot: Shows the relationship between resource and population size, revealing cyclical interactions.

Analysis of Results

  1. Time Series Plot:

    • Oscillations in both resource and population sizes indicate cyclical predator-prey-like dynamics.
    • When the resource is abundant, the population grows.
    • As the population consumes the resource, the resource decreases, leading to a population decline.
    • This allows the resource to regenerate, starting the cycle again.
  2. Phase Plot:

    • The cyclical pattern represents the feedback loop between resource availability and population growth.
    • The trajectory suggests a stable limit cycle, indicating sustainable oscillations rather than collapse or unbounded growth.

Environmental Implications

This model illustrates the delicate balance between resource consumption and population growth, relevant for:

  • Sustainability Analysis: Ensuring renewable resources are not overexploited.
  • Conservation Biology: Managing species populations and their habitats.
  • Agricultural Planning: Balancing crop yields with environmental impacts.