Ha Khanh Nguyen (hknguyen)
random.choices() allows us to sample values from a "population" with replacement.
import random
x = random.choices([1, 2, 3, 4, 5], k=1)
x
[5]
type(x)
list
values = random.choices([1, 2, 3, 4, 5], k=5)
values
[3, 1, 5, 2, 1]
sample = [10, 8, 16, 24, 0, 8, 0, 0, 16, 12]
new_sample = random.choices(sample, k=len(sample))
new_sample
[8, 0, 16, 8, 8, 16, 0, 24, 8, 8]
# resample from the original sample
new_sample = random.choices(sample, k=len(sample))
new_sample
[24, 16, 8, 8, 16, 8, 8, 8, 8, 0]
import numpy as np
np.mean(new_sample)
10.4
# Repeat the resampling 10000 times, recording the mean of each resample
sample_means = []
for i in range(10000):
    # resample with replacement, keeping the same size as the original sample
    new_sample = random.choices(sample, k=len(sample))
    x_bar = np.mean(new_sample)
    sample_means.append(x_bar)
import matplotlib.pyplot as plt
plt.hist(sample_means, color='darkorange')
plt.grid(color='lightgrey', linewidth=0.5)
plt.show()
np.quantile(sample_means, q=0.025)
4.8
np.quantile(sample_means, q=0.975)
14.0
plt.hist(sample_means, color='darkorange')
plt.grid(color='lightgrey', linewidth=0.5)
plt.axvline(x=np.quantile(sample_means, q=0.025))
plt.axvline(x=np.quantile(sample_means, q=0.975))
plt.show()
Remember the faithful dataset that we worked with at the beginning of the semester?
import pandas as pd
faithful = pd.read_csv('https://stat107.hknguyen.org/files/datasets/faithful.csv')
faithful
eruptions | waiting | |
---|---|---|
0 | 3.600 | 79 |
1 | 1.800 | 54 |
2 | 3.333 | 74 |
3 | 2.283 | 62 |
4 | 4.533 | 85 |
... | ... | ... |
267 | 4.117 | 81 |
268 | 2.150 | 46 |
269 | 4.417 | 90 |
270 | 1.817 | 46 |
271 | 4.467 | 74 |
272 rows × 2 columns
waiting = faithful['waiting']
waiting
0 79 1 54 2 74 3 62 4 85 .. 267 81 268 46 269 90 270 46 271 74 Name: waiting, Length: 272, dtype: int64
# resample from waiting
new_sample = random.choices(waiting, k=len(waiting))
# compute the proportion of waiting time exceeding 60 mins (1 hour)
np.mean(np.array(new_sample) > 60)
0.7757352941176471
# Repeat the resampling 10000 times, recording for each resample the
# proportion of waiting times exceeding 60
sample_props = []
for i in range(10000):
    # resample with replacement, keeping the same size as the original data
    new_sample = random.choices(waiting, k=len(waiting))
    # the mean of a boolean array is the proportion of True values
    p_hat = np.mean(np.array(new_sample) > 60)
    sample_props.append(p_hat)
plt.hist(sample_props, color='darkorange')
plt.grid(color='lightgrey', linewidth=0.5)
plt.show()
# lower bound
np.quantile(sample_props, q=0.005)
0.6213235294117647
# upper bound
np.quantile(sample_props, q=1-0.005)
0.7610294117647058
plt.hist(sample_props, color='darkorange')
plt.grid(color='lightgrey', linewidth=0.5)
plt.axvline(x=np.quantile(sample_props, q=0.005))
plt.axvline(x=np.quantile(sample_props, q=0.995))
plt.show()
tweets = pd.read_csv("https://stat107.hknguyen.org/files/datasets/trump_tweets.csv", encoding="ISO-8859-1")
tweets
text | favorited | favoriteCount | replyToSN | created | truncated | replyToSID | id | replyToUID | statusSource | screenName | retweetCount | isRetweet | retweeted | longitude | latitude | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | My economic policy speech will be carried live... | False | 9214 | NaN | 2016-08-08 15:20:44 | False | NaN | 762669882571980801 | NaN | <a href="http://twitter.com/download/android" ... | realDonaldTrump | 3107 | False | False | NaN | NaN |
1 | Join me in Fayetteville, North Carolina tomorr... | False | 6981 | NaN | 2016-08-08 13:28:20 | False | NaN | 762641595439190016 | NaN | <a href="http://twitter.com/download/iphone" r... | realDonaldTrump | 2390 | False | False | NaN | NaN |
2 | #ICYMI: "Will Media Apologize to Trump?" https... | False | 15724 | NaN | 2016-08-08 00:05:54 | False | NaN | 762439658911338496 | NaN | <a href="http://twitter.com/download/iphone" r... | realDonaldTrump | 6691 | False | False | NaN | NaN |
3 | Michael Morell, the lightweight former Acting ... | False | 19837 | NaN | 2016-08-07 23:09:08 | False | NaN | 762425371874557952 | NaN | <a href="http://twitter.com/download/android" ... | realDonaldTrump | 6402 | False | False | NaN | NaN |
4 | The media is going crazy. They totally distort... | False | 34051 | NaN | 2016-08-07 21:31:46 | False | NaN | 762400869858115588 | NaN | <a href="http://twitter.com/download/android" ... | realDonaldTrump | 11717 | False | False | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1507 | "@constant4change: Trump tops Dem candidates o... | False | 2590 | NaN | 2015-12-20 08:21:23 | False | NaN | 678490367285678081 | NaN | <a href="http://twitter.com/download/android" ... | realDonaldTrump | 990 | False | False | NaN | NaN |
1508 | "@autumnandews08: @realDonaldTrump @jonkarl Hi... | False | 3550 | NaN | 2015-12-20 05:25:13 | False | NaN | 678446032599040001 | NaN | <a href="http://twitter.com/download/android" ... | realDonaldTrump | 1146 | False | False | NaN | NaN |
1509 | "@DomineekSmith: @realDonaldTrump is the best ... | False | 3719 | NaN | 2015-12-20 05:11:04 | False | NaN | 678442470720577537 | NaN | <a href="http://twitter.com/download/android" ... | realDonaldTrump | 1151 | False | False | NaN | NaN |
1510 | Another great accolade for @TrumpGolf. Highly ... | False | 2304 | NaN | 2015-12-14 21:11:12 | False | NaN | 676509769562251264 | NaN | <a href="http://twitter.com/download/iphone" r... | realDonaldTrump | 713 | False | False | NaN | NaN |
1511 | Record of Health: https://t.co/ZDDDawwYVl\n#Ma... | False | 2599 | NaN | 2015-12-14 20:09:15 | False | NaN | 676494179216805888 | NaN | <a href="http://twitter.com/download/iphone" r... | realDonaldTrump | 952 | False | False | NaN | NaN |
1512 rows × 16 columns
tweet_lengths = tweets['text'].str.len()
tweet_lengths
0 67 1 114 2 64 3 134 4 135 ... 1507 97 1508 126 1509 105 1510 133 1511 75 Name: text, Length: 1512, dtype: int64
plt.hist(tweet_lengths, color='darkgrey')
plt.grid(color='lightgrey', linewidth=0.5)
plt.show()
# resample from the original sample
new_sample = random.choices(tweet_lengths, k=len(tweet_lengths))
# median
np.median(new_sample)
126.0
# Repeat the resampling 5000 times, recording the median of each resample
sample_meds = []
for i in range(5000):
    # resample with replacement, keeping the same size as the original data
    new_sample = random.choices(tweet_lengths, k=len(tweet_lengths))
    med = np.median(new_sample)
    sample_meds.append(med)
plt.hist(sample_meds, color='darkorange')
plt.grid(color='lightgrey', linewidth=0.5)
plt.show()
np.quantile(sample_meds, q=(1-0.87)/2)
125.5
np.quantile(sample_meds, q=1-(1-0.87)/2)
129.0
plt.hist(sample_meds, color='darkorange')
plt.grid(color='lightgrey', linewidth=0.5)
plt.axvline(x=np.quantile(sample_meds, q=(1-0.87)/2))
plt.axvline(x=np.quantile(sample_meds, q=1-(1-0.87)/2))
plt.show()