On to ch3
parent
8af7b3915c
commit
f8c316559e
@ -0,0 +1,19 @@
|
||||
# Chapter 1
|
||||
|
||||
## Takeaways
|
||||
|
||||
This chapter uses a fictitious first day as a Data Scientist to illustrate some social science research. In particular it illustrates the need to ask good questions of the data you have.
|
||||
|
||||
For example, we had data on inter-office friendships and professional interests. From there we could determine:
|
||||
|
||||
- Average number of friends
|
||||
- Total count of inter office friendships
|
||||
- A suggested list of people a person could befriend based on mutual friendships (friends of a friend)
|
||||
- A suggested list of people a person could befriend based on mutual interest.
|
||||
|
||||
|
||||
These examples continued with salary data, tenure, and paid accounts
|
||||
|
||||
- At first, we explored the relationship between salary and tenure with a visualization, which showed that individuals with longer tenures tend to earn more.
|
||||
- Since tenure was widely distributed, it was necessary to bucket individuals by tenure to get an average salary for each tenure range.
|
||||
- This led to the insight that data scientists with more than 5 years of experience earn 65% more than data scientists with two years of experience or less.
|
@ -0,0 +1,233 @@
|
||||
from collections import Counter, defaultdict
|
||||
from typing import List, Dict, Tuple, Any
|
||||
|
||||
# Sample users. Each user's id is its position in the list of names below.
USERS = [
    {"id": user_id, "name": name}
    for user_id, name in enumerate(
        (
            "Hero",
            "Dunn",
            "Sue",
            "Chi",
            "Thor",
            "Clive",
            "Hicks",
            "Devin",
            "Kate",
            "Klein",
        )
    )
]
|
||||
|
||||
# Friendship pairs of user ids; each friendship is listed once.
# fmt: off
FRIENDSHIPS = [
    (0, 1), (0, 2), (1, 2), (1, 3),
    (2, 3), (3, 4), (4, 5), (5, 6),
    (5, 7), (6, 8), (7, 8), (8, 9),
]
# fmt: on
|
||||
|
||||
# (user_id, interest) pairs, written grouped by user and flattened in order.
# NOTE: the constant's name is misspelled ("INTERETS"); it is kept because the
# rest of this module refers to it by this name.
INTERETS = [
    (user_id, interest)
    for user_id, user_interests in (
        (0, ("Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra")),
        (1, ("NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres")),
        (2, ("Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas")),
        (3, ("R", "Python", "statistics", "regression", "probability")),
        (4, ("machine learning", "regression", "decision trees", "libsvm")),
        (5, ("Python", "R", "Java", "C++", "Haskell", "programming languages")),
        (6, ("statistics", "probability", "mathematics", "theory")),
        (7, ("machine learning", "scikit-learn", "Mahout", "neural networks")),
        (8, ("neural networks", "deep learning", "Big Data", "artificial intelligence")),
        (9, ("Hadoop", "Java", "MapReduce", "Big Data")),
    )
    for interest in user_interests
]
|
||||
|
||||
|
||||
def add_friendships(
    users: List[Dict], friendships: List[Tuple[int, int]]
) -> List[Dict]:
    """Attach a ``"friends"`` list to every user and fill it from ``friendships``.

    Each ``(i, j)`` pair names two users by their ``"id"`` value. The pair is
    recorded in both directions, so each user ends up in the other's list.
    Users are looked up through their ``"id"`` key rather than their position
    in ``users``, so the list does not have to be ordered by id.

    The user dicts are modified in place (any existing ``"friends"`` entry is
    replaced) and the same ``users`` list is returned. The friend lists hold
    references to the user dicts themselves, not copies.

    Raises:
        KeyError: if a user has no ``"id"`` key, or a friendship refers to an
            id that no user has.
    """
    users_by_id = {user["id"]: user for user in users}

    for user in users:
        user["friends"] = []

    for i, j in friendships:
        users_by_id[i]["friends"].append(users_by_id[j])  # add j as a friend of i
        users_by_id[j]["friends"].append(users_by_id[i])  # add i as a friend of j

    return users
|
||||
|
||||
|
||||
def number_of_friends(user: Dict) -> int:
    """Return how many entries are in the user's ``"friends"`` list."""
    friends = user["friends"]
    return len(friends)
|
||||
|
||||
|
||||
def friend_counts(users: List[Dict]) -> List[Tuple[int, int]]:
    """Return ``(user id, friend count)`` pairs, most friends first.

    Users with the same number of friends keep their relative order from
    ``users`` because the sort is stable.
    """
    pairs = []
    for user in users:
        pairs.append((user["id"], number_of_friends(user)))

    # Sort in place on the count (second element of each pair), descending.
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs
|
||||
|
||||
|
||||
def friend_of_friend_ids_bad(user: Dict[str, Any]) -> List[int]:
    """Return the id of every friend of each of the user's friends.

    Naive version: nothing is filtered out, so ids can repeat and the result
    can contain the user's own id and the ids of the user's direct friends.
    """
    ids = []
    for friend in user["friends"]:
        for foaf in friend["friends"]:
            ids.append(foaf["id"])
    return ids
|
||||
|
||||
|
||||
def find_friend_of_friends(user: Dict) -> List[int]:
    """Return the distinct ids of the user's friends of friends.

    The user's own id and the ids of the user's direct friends are excluded.
    The result is built from a set, so the order of the ids is unspecified.
    """
    # Every id reachable in two hops, duplicates collapsed by the set.
    candidate_ids = set()
    for friend in user.get("friends"):
        for friend_of_friend in friend.get("friends"):
            candidate_ids.add(friend_of_friend.get("id"))

    # Ids that must not be suggested: direct friends and the user itself.
    excluded_ids = set()
    for friend in user.get("friends"):
        excluded_ids.add(friend.get("id"))
    excluded_ids.add(user.get("id"))

    return list(candidate_ids - excluded_ids)
|
||||
|
||||
|
||||
def not_the_same(user: Dict, other_user: Dict) -> bool:
    """Return True when the two users have different ``"id"`` values.

    In an object-oriented design this comparison would live in ``__eq__``.
    """
    user_id = user.get("id")
    other_id = other_user.get("id")
    return user_id != other_id
|
||||
|
||||
|
||||
def not_friends(user: Dict, other_user: Dict) -> bool:
    """Return True when no entry in ``user``'s friends list has ``other_user``'s id."""
    for friend in user.get("friends"):
        if not not_the_same(friend, other_user):
            # Found a friend with the same id, so they are already friends.
            return False
    return True
|
||||
|
||||
|
||||
def friends_of_friends_ids(user: Dict[str, Any]) -> Counter:
    """Count the user's friends of friends by id.

    For each candidate id, the count is the number of the user's friends whose
    own friends list contains that candidate (i.e. the number of mutual
    friends). The user and the user's existing friends are not counted.
    """
    mutual_friend_counts = Counter()
    for friend in user.get("friends"):
        for foaf in friend.get("friends"):
            if not_the_same(user, foaf) and not_friends(user, foaf):
                mutual_friend_counts[foaf.get("id")] += 1
    return mutual_friend_counts
|
||||
|
||||
|
||||
def data_scientist_who_like(target_interst: str) -> List[int]:
    """Return the ids of the users who list ``target_interst`` as an interest.

    The comparison is an exact equality check. This is not efficient: every
    call scans the entire module-level ``INTERETS`` list.
    """
    matching_ids = []
    for user_id, interest in INTERETS:
        if target_interst == interest:
            matching_ids.append(user_id)
    return matching_ids
|
||||
|
||||
|
||||
def build_interest_to_user_index(
    interests: List[Tuple[int, "str"]]
) -> Dict[str, List[int]]:
    """Group user ids by interest.

    Returns a ``defaultdict(list)`` that maps each interest to the ids of the
    users who have it, in the order the pairs appear in ``interests``.
    """
    index = defaultdict(list)

    for pair in interests:
        owner_id, topic = pair
        index[topic].append(owner_id)

    return index
|
||||
|
||||
|
||||
def build_user_to_interest_index(
    interests: List[Tuple[int, "str"]]
) -> Dict[int, List[str]]:
    """Group interests by user id.

    Returns a ``defaultdict(list)`` that maps each user id to that user's
    interests, in the order the pairs appear in ``interests``.
    """
    index = defaultdict(list)

    for pair in interests:
        owner_id, topic = pair
        index[owner_id].append(topic)

    return index
|
||||
|
||||
|
||||
def most_common_interests_with(
    user: Dict[str, Any],
    user_index: Dict[int, List[str]],
    interest_index: Dict[str, List[int]],
) -> Counter:
    """Count how many interests each other user shares with ``user``.

    Args:
        user: user dict; only its ``"id"`` is read.
        user_index: mapping of user id to that user's interests.
        interest_index: mapping of interest to the ids of users who have it.

    Returns:
        A ``Counter`` keyed by other users' ids, where each value is the
        number of interests that user has in common with ``user``. The id of
        ``user`` itself is never counted.

    A user with no entry in ``user_index``, or an interest with no entry in
    ``interest_index``, contributes nothing to the result.
    """
    user_id = user.get("id")
    return Counter(
        interested_user_id
        # ``.get`` returns None for a missing key (even on a defaultdict), so
        # fall back to an empty tuple to keep the loops iterable.
        for interest in user_index.get(user_id, ())
        for interested_user_id in interest_index.get(interest, ())
        if user_id != interested_user_id
    )
|
||||
|
||||
|
||||
# Write a function that will produce a count of users expressing an interest
|
||||
|
||||
|
||||
def build_interest_counter(interests: List[Tuple[int, str]]) -> Counter:
    """Count the words used across all interests.

    Each interest is lower-cased and split on whitespace before counting; the
    user ids in the pairs are ignored.
    """
    word_counts = Counter()
    for _, interest in interests:
        word_counts.update(interest.lower().split())
    return word_counts
|
||||
|
||||
|
||||
def print_counter(counter: Counter):
    """Print each entry of ``counter`` on its own line as ``key: count``."""
    for entry in counter.items():
        word, count = entry
        print(f"{word}: {count}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    separator = "-" * 100

    # --- Friendships: attach friend lists, then report basic statistics. ---
    users = add_friendships(USERS, FRIENDSHIPS)
    hero = users[0]
    chi = users[3]

    num_users = len(users)
    # Sum of per-user friend counts, so every friendship is counted once for
    # each of its two members.
    total_connections = sum(number_of_friends(user) for user in users)
    avg_connections = total_connections / num_users

    print(f"Total user count: {num_users}")
    print(f"Total number of connections: {total_connections}")
    print(f"Average connections: {avg_connections}")

    counts = friend_counts(users)
    print(counts)

    # Naive friend-of-friend ids for Hero, followed by the de-duplicated version.
    heros_suggested_friends = friend_of_friend_ids_bad(hero)
    print(heros_suggested_friends)
    better_friend_suggestion = find_friend_of_friends(hero)
    print(better_friend_suggestion)

    chis_friend_suggestions_with_counts = friends_of_friends_ids(chi)
    print(f"Chi's friend suggestions: {chis_friend_suggestions_with_counts}")

    # --- Interests: who shares the most interests with Chi. ---
    print(separator)
    user_idx = build_user_to_interest_index(INTERETS)
    interest_idx = build_interest_to_user_index(INTERETS)

    chis_common_interests = most_common_interests_with(chi, user_idx, interest_idx)
    print("Chi's most users with greatest interest overlap:")
    print(chis_common_interests)

    # --- Word frequencies across every interest. ---
    print(separator)
    interest_counter = build_interest_counter(INTERETS)
    print_counter(interest_counter)
|
@ -0,0 +1,28 @@
|
||||
"""
|
||||
Created just to test the idea of OOP for User representation. Kind of heavy on the memory usage side though.
|
||||
"""
|
||||
|
||||
|
||||
from typing import List, Optional
|
||||
|
||||
|
||||
class User:
    """A user with an id, a name and a list of friend ids.

    Two users compare equal when their ids match, and the hash is derived from
    the id as well, so instances can be used in sets and as dict keys.
    """

    def __init__(
        self, id: int, name: str, freinds: Optional[List[int]] = None
    ) -> None:
        """Create a user.

        Args:
            id: identifier used for equality and hashing.
            name: display name, shown in ``repr``.
            freinds: friend ids; the list is stored as given, not copied.
                Omit it to start with a new empty list. The parameter name is
                misspelled but kept so existing keyword callers keep working.
        """
        self.id = id
        self.name = name
        # A ``[]`` default would be one list shared by every user created
        # without an explicit argument; build a fresh list per instance.
        self.friends = [] if freinds is None else freinds

    def __eq__(self, other):
        """Return True only for another ``User`` with the same id."""
        return isinstance(other, User) and self.id == other.id

    def __hash__(self):
        # Defining __eq__ alone would make instances unhashable; hash on the
        # same field that equality compares.
        return hash(self.id)

    def __repr__(self):
        return f"User({self.id}, '{self.name}')"
|
||||
|
||||
|
||||
class SuperUser(User):
    """A ``User`` whose ``admin`` attribute is set to ``True``."""

    def __init__(self, id: int, name: str, friends: Optional[List[int]] = None):
        """Create an admin user; omit ``friends`` to start with a new empty list."""
        # A ``[]`` default would be one list shared by every SuperUser created
        # without an explicit argument; build a fresh list per instance here.
        super().__init__(id, name, [] if friends is None else friends)
        self.admin = True
|
@ -0,0 +1,18 @@
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
movies = ["Annie Hall", "Ben-hur", "Casablanca", "Gandhi", "West Side Story"]
num_oscars = [5, 11, 3, 8, 10]

# One x position per movie: its index shifted right by 0.1. The same positions
# are used for the bars and for the tick labels, so the two always line up.
# NOTE(review): the offset appears to date from when bars were drawn from their
# left edge; current matplotlib centers bars on x by default, so it looks
# unnecessary -- confirm before removing.
xs = [position + 0.1 for position in range(len(movies))]

# Draw one bar per movie at those x positions, with the Oscar counts as heights.
plt.bar(xs, num_oscars)
plt.title("Movies")
plt.ylabel("# of Oscars")

# Label the x-axis with the movie names at the bars' x positions.
plt.xticks(xs, movies)
plt.show()
|
@ -0,0 +1,11 @@
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
# One GDP value per decade start year: 1950, 1960, ..., 2010.
gdp = [300.2, 543.3, 1075.9, 2862.5, 5979.6, 10289.7, 14958.3]
years = range(1950, 2020, 10)

# Line chart with years on the x-axis and GDP on the y-axis: a solid green
# line with a circle marker at each data point.
plt.plot(years, gdp, color="green", marker="o", linestyle="solid")

plt.ylabel("Billions of $")
plt.title("Nominal GDP")

plt.show()
|
@ -0,0 +1,2 @@
|
||||
matplotlib==3.0.2
|
||||
black
|
Loading…
Reference in New Issue