## Data source: https://insideairbnb.com/get-the-data/

import os
#Checking for current directory and switching to project folder
current_directory = os.getcwd()
print(current_directory)
os.chdir('/Users/smarr/Documents/Data Sets/AirBNBAnalysis')

/Users/smarr/Documents/Data Sets/Churn Analysis

# Verifying Directory was switched
os.getcwd()

'/Users/smarr/Documents/Data Sets/AirBNBAnalysis'

import pandas as pd

## Importing listings data and cleaning the price column to ensure successful numerical analysis
listings = pd.read_csv('../AirBNBAnalysis/data/listings_clean.csv')
listings['price']= listings['price'].replace('[\$,]', '', regex=True).astype(float)
listings.to_csv('listings_clean.csv', index=False)

<>:3: SyntaxWarning: invalid escape sequence '\$'
<>:3: SyntaxWarning: invalid escape sequence '\$'
/var/folders/yw/pqk5wsp91vlcbh2lpn3gp63w0000gn/T/ipykernel_99117/818574451.py:3: SyntaxWarning: invalid escape sequence '\$'
  listings['price']= listings['price'].replace('[\$,]', '', regex=True).astype(float)

# Testing
listings.head()

# Importing SQL python package to work with the SQL database, establishing connection between python and SQL
from sqlalchemy import create_engine
engine = create_engine('postgresql://smarr@localhost:5432/airbnb_analysis', echo=False)
conn = engine.connect()

#Submitting our listings dataframe to SQL
listings.to_sql('listings', engine, if_exists='replace', index=False)

264

## We are going to partition the ensuing calendar dataset since it is extremely large and jupyter cannot feasibly process it all

chunksize = 500000
sample_size = 50000
samples = []

for chunk in pd.read_csv('../AirBNBAnalysis/data/calendar_sampled.csv', chunksize=chunksize):
    # Random sample 2% from each chunk (adjust if needed)
    sample = chunk.sample(frac=0.02, random_state=42)
    samples.append(sample)
    
    # Stop once we hit the target
    if sum(len(s) for s in samples) >= sample_size:
        break

df_sampled = pd.concat(samples).head(sample_size)

## Saving it to a smaller csv
df_sampled.to_csv('../AirBNBAnalysis/data/calendar_sampled.csv', index=False)

df_sampled['price'] = df_sampled['price'].replace(r'[\$,]', '', regex=True).astype(float)

df_sampled.to_sql('calendar', engine, if_exists='replace', index=False)

1000

# Testing data frame for adjusted price column
df_sampled.head()

## call engine.dispose() or ensure .close() is used on connections to prevent table lock in postgresql

df = pd.read_sql_query("""
    SELECT 
        TO_CHAR(c.date::DATE, 'YYYY-MM') AS month,
        l.neighbourhood_cleansed,
        ROUND(AVG(c.price)::numeric, 2) AS avg_price
    FROM calendar c
    JOIN listings l ON c.listing_id = l.id
    WHERE c.available = 't'
    GROUP BY TO_CHAR(c.date::DATE, 'YYYY-MM'), l.neighbourhood_cleansed
""", conn)

pivot_df = df.pivot(index='neighbourhood_cleansed', columns='month', values='avg_price')
pivot_df.head()

pivot_df_cleaned = pivot_df.dropna()
pivot_df_cleaned
## Some color would look great on this

styled_df=pivot_df_cleaned.style.background_gradient(
    cmap='RdYlGn_r',  # Red for high, green for low
    axis=1            # Apply column-wise
)
styled_df

dfReviews = pd.read_csv("../AirBNBAnalysis/data/reviews.csv", low_memory=False)

dfReviews.to_sql('reviews', engine, if_exists='replace', index=False)

919

dfReviews.head()

## Drop Null values as they will make it more difficult to do sentiment analysis
dfReviews=dfReviews.dropna(subset=['comments'])

## Checking length of Dataframe for verification purposes
len(dfReviews)

970678

## Make all comments lower-case for standardizing purposes
dfReviews['comments']=dfReviews['comments'].str.lower()

from textblob import TextBlob

## Define Function
def get_sentiment(text):
    return TextBlob(text).sentiment.polarity

## Apply it to Airbnb comments
dfReviews['Sentiment']=dfReviews['comments'].apply(get_sentiment)

dfReviews[['listing_id','Sentiment']].head()
## Doesn't quite get us our average sentiment by listing

sentimentAvgandCount_by_listing=dfReviews.groupby('listing_id').agg(avg_sentiment=('Sentiment','mean'),sentiment_count=('Sentiment','count')).reset_index()
sentimentAvgandCount_by_listing

## Merge with Listings data
CombinedDF=listings.merge(sentimentAvgandCount_by_listing,left_on='id',right_on='listing_id')

CombinedDF

sentimentAvgandCount_by_listing.sort_values(by='avg_sentiment', ascending=False).head(10)
## Want to get listings that have a sentiment count greater than 10, to get a substantive analysis

sentimentAvgandCount_by_listing = sentimentAvgandCount_by_listing[sentimentAvgandCount_by_listing['sentiment_count']>10]

sentimentAvgandCount_by_listing.sort_values(by='avg_sentiment', ascending=False).head(10)
## Listing ID is pretty abstract. I want to see the URl and the neighbourhood that each listing is in.

sentimentAvgandCount_by_listingNew=sentimentAvgandCount_by_listing.merge(listings[['neighbourhood_cleansed', 'listing_url','id']], left_on = 'listing_id', right_on='id')

## Now lets run the group by and get a better idea of the area and general quality of the listing
sentimentAvgandCount_by_listingNew.sort_values(by='avg_sentiment', ascending=False).head(10).drop(columns=['id'], errors='ignore')

## Taking a look at the above listing url's will quickly demonstrate why they each score farily high on the average sentimeent column.

CombinedDF_NeighbourhoodGroupby=dfReviews.merge(listings[['id','neighbourhood_cleansed']], left_on = 'listing_id', right_on='id')

CombinedDF_NeighbourhoodGroupby

## Drop id_x and y values
CombinedDF_NeighbourhoodGroupby = CombinedDF_NeighbourhoodGroupby.drop(columns=['id_x', 'id_y'], errors='ignore')

CombinedDF_NeighbourhoodGroupby

sentimentAvgandCount_by_neighbourhood=CombinedDF_NeighbourhoodGroupby.groupby('neighbourhood_cleansed').agg(avg_sentiment=('Sentiment','mean'),sentiment_count=('Sentiment','count')).reset_index()

sentimentAvgandCount_by_neighbourhood.sort_values(by='avg_sentiment', ascending=False).head(10)

# For a more reliable analysis, lets include only the neighbourhood scontaining more than 50 sentiment_count entries
sentimentAvgandCount_by_neighbourhood = sentimentAvgandCount_by_neighbourhood[sentimentAvgandCount_by_neighbourhood['sentiment_count']>50]

sentimentAvgandCount_by_neighbourhood.sort_values(by='avg_sentiment', ascending=False).head(10)
## Not surprisignly, we see neighbourhoods like Vinegar Hill, Financial District, and Rockaway beach score relatively highly. 
## I'm sure Tottenville is a great place too - I've yet to see Staten Island

	id	listing_url	scrape_id	last_scraped	source	name	description	neighborhood_overview	picture_url	host_id	...	review_scores_communication	review_scores_location	review_scores_value	license	instant_bookable	calculated_host_listings_count	calculated_host_listings_count_entire_homes	calculated_host_listings_count_private_rooms	reviews_per_month
0	36121	https://www.airbnb.com/rooms/36121	20250301213336	2025-03-03	city scrape	Lg Rm in Historic Prospect Heights	Cozy space share in the heart of a great neigh...	Full of tree-lined streets and beautiful brown...	https://a0.muscache.com/pictures/8776354/35b32...	62165	...	5.00	5.00	5.00	NaN	f	1	0	1	0.05
1	36647	https://www.airbnb.com/rooms/36647	20250301213336	2025-03-03	city scrape	1 Bedroom & your own Bathroom, Elevator Apartment	Private bedroom with your own bathroom in a 2 ...	Manhattan, SE corner of 2nd Ave/ E. 110th street	https://a0.muscache.com/pictures/miso/Hosting-...	157798	...	4.90	4.38	4.71	NaN	f	1	0	1	0.58
2	38663	https://www.airbnb.com/rooms/38663	20250301213336	2025-03-02	city scrape	Luxury Brownstone in Boerum Hill	Beautiful, large home in great hipster neighbo...	diverse, lively, hip, cool: loaded with restau...	https://a0.muscache.com/pictures/miso/Hosting-...	165789	...	4.88	4.86	4.62	OSE-STRREG-0001784	f	1	0	1	0.28
3	38833	https://www.airbnb.com/rooms/38833	20250301213336	2025-03-03	city scrape	Spectacular West Harlem Garden Apt	This is a very large and unique space. An inc...	West Harlem is now packed with great restauran...	https://a0.muscache.com/pictures/7554f9e5-4ab0...	166532	...	4.96	4.79	4.82	OSE-STRREG-0000476	f	1	1	0	1.36
4	39282	https://www.airbnb.com/rooms/39282	20250301213336	2025-03-02	city scrape	“Work-from-home” from OUR home.	*Monthly Discount will automatically apply <br...	THE NEIGHBORHOOD:<br />Our apartment is locate...	https://a0.muscache.com/pictures/ef8f43ad-d967...	168525	...	4.88	4.85	4.78	OSE-STRREG-0001150	f	2	0	2	1.54

	listing_id	date	available	price	adjusted_price	minimum_nights	maximum_nights
33553	9116831	2025-03-22	f	165.0	NaN	30	1125
9427	763600	2025-06-19	f	75.0	NaN	30	1125
199	1989731	2025-03-14	f	137.0	NaN	30	1125
12447	2620837	2025-12-11	f	52.0	NaN	30	62
39489	10369038	2025-07-21	f	50.0	NaN	30	365

month	2025-03	2025-04	2025-05	2025-06	2025-07	2025-08	2025-09	2025-10	2025-11	2025-12	2026-01	2026-02
neighbourhood_cleansed
Arverne	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	299.0	NaN	NaN	NaN
Astoria	NaN	NaN	NaN	NaN	180.0	NaN	NaN	95.00	NaN	48.0	NaN	NaN
Bedford-Stuyvesant	69.5	171.0	NaN	200.0	221.0	192.5	225.0	216.33	55.0	134.5	139.67	NaN
Boerum Hill	199.0	NaN	175.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Borough Park	NaN	NaN	300.0	NaN	NaN	NaN	NaN	61.00	NaN	NaN	NaN	NaN

	listing_id	id	date	reviewer_id	reviewer_name	comments
0	2595	17857	2009-11-21	50679	Jean	Notre séjour de trois nuits.\r<br/>Nous avons ...
1	2595	19176	2009-12-05	53267	Cate	Great experience.
2	2595	19760	2009-12-10	38960	Anita	I've stayed with my friend at the Midtown Cast...
3	2595	34320	2010-04-09	71130	Kai-Uwe	We've been staying here for about 9 nights, en...
4	2595	46312	2010-05-25	117113	Alicia	We had a wonderful stay at Jennifer's charming...

	listing_id	Sentiment
0	2595	0.000000
1	2595	0.800000
2	2595	0.410124
3	2595	0.266875
4	2595	0.518750

New York AirBNB Exploratory and Sentiment Analysis¶

Importing Packages and preparing data¶

Pivot table for understanding price change over time across neighbourhoods¶

Adding reviews csv file for sentiment analysis (Python and PostgreSQL)¶

Sentiment Scoring¶

Average sentiment and count by listing ID¶

Average sentiment and count by neighbourhood (Additional Merge is needed)¶

	listing_id	avg_sentiment	sentiment_count
0	2595	0.333892	49
1	6848	0.386196	194
2	6872	0.559630	1
3	6990	0.392311	251
4	7064	0.476511	13
...	...	...	...
25642	1343696908645827246	-0.244413	1
25643	1344203989320440460	0.246717	1
25644	1344298115530569589	0.392778	2
25645	1348032300167221882	0.258333	2
25646	1348081441771732943	0.191255	2

	listing_id	avg_sentiment	sentiment_count
12823	42757120	1.0	1
6714	18996738	1.0	1
15318	51485535	1.0	1
21005	848890927389228890	1.0	1
25274	1242992636408069849	1.0	1
25490	1279026320984012410	1.0	1
25278	1243527377526506606	1.0	1
3939	9545498	1.0	1
15295	51421349	1.0	2
15289	51402678	1.0	1

	listing_id	avg_sentiment	sentiment_count
14220	48257754	0.658674	11
10103	31216852	0.655640	15
13262	44651142	0.653958	12
17759	638404496657149559	0.646865	14
4239	10089727	0.645863	21
14209	48251757	0.642101	19
11205	36224443	0.640738	11
15573	52169805	0.638703	11
10658	34071447	0.629037	14
9189	27955820	0.628515	14

	listing_id	avg_sentiment	sentiment_count	neighbourhood_cleansed	listing_url
7776	48257754	0.658674	11	Financial District	https://www.airbnb.com/rooms/48257754
5636	31216852	0.655640	15	Chelsea	https://www.airbnb.com/rooms/31216852
7277	44651142	0.653958	12	Chelsea	https://www.airbnb.com/rooms/44651142
9774	638404496657149559	0.646865	14	Cambria Heights	https://www.airbnb.com/rooms/638404496657149559
2433	10089727	0.645863	21	Midtown	https://www.airbnb.com/rooms/10089727
7769	48251757	0.642101	19	Financial District	https://www.airbnb.com/rooms/48251757
6283	36224443	0.640738	11	Theater District	https://www.airbnb.com/rooms/36224443
8526	52169805	0.638703	11	Chelsea	https://www.airbnb.com/rooms/52169805
5980	34071447	0.629037	14	SoHo	https://www.airbnb.com/rooms/34071447
5157	27955820	0.628515	14	Hell's Kitchen	https://www.airbnb.com/rooms/27955820

	neighbourhood_cleansed	avg_sentiment	sentiment_count
218	Woodrow	0.671979	4
9	Bay Terrace, Staten Island	0.639286	1
140	Navy Yard	0.500429	9
52	Country Club	0.481688	29
194	Tottenville	0.474173	213
35	Chelsea, Staten Island	0.472478	2
21	Breezy Point	0.463310	116
203	Vinegar Hill	0.462069	768
149	Oakwood	0.458333	2
14	Belle Harbor	0.458162	245