Wine Reviews

Analyzing Trends in Taste, Price, and Quality

Vadym Musiienko

2024-11-29

Data frame information

Columns

[1] "...1"        "country"     "description" "designation" "points"     
[6] "price"       "province"    "variety"    

Rows

Rows: 137,230
Columns: 8
$ ...1        <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, …
$ country     <fct> US, Spain, US, US, France, Spain, Spain, Spain, US, US, It…
$ description <chr> "This tremendous 100% varietal wine hails from Oakville an…
$ designation <chr> "Martha's Vineyard", "Carodorum Selección Especial Reserva…
$ points      <dbl> 96, 96, 96, 96, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95…
$ price       <dbl> 235, 110, 90, 65, 66, 73, 65, 110, 65, 60, 80, 48, 48, 90,…
$ province    <chr> "California", "Northern Spain", "California", "Oregon", "P…
$ variety     <chr> "Cabernet Sauvignon", "Tinta de Toro", "Sauvignon Blanc", …

Average price of wine by country

The countries that produce the most expensive wines are the US, France, England, Hungary, and Luxembourg, with average prices ranging from 40 to 50 USD. On the other hand, the cheapest wine producers are Montenegro, Lithuania, Bulgaria, Bosnia, and Ukraine, where wines typically cost between 10 and 15 USD.

Code

# Mean bottle price per country, drawn as bars ordered from the most to the
# least expensive country.
wine_data |>
  group_by(country) |>
  summarise(avr_price = mean(price)) |>
  ggplot(
    aes(
      x = fct_reorder(country, avr_price, .desc = TRUE),
      y = avr_price,
      fill = country
    )
  ) +
  geom_col(width = 0.8, color = "black", show.legend = FALSE) +
  theme_minimal() +
  # Country names are long, so rotate them to keep the axis readable
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = "Average price of wine by country", x = NULL, y = "Price in USD")

Correlation between Price and Quality

There is a clear positive correlation between price and rating: more expensive wines tend to receive higher ratings. The relationship is strongest in the 0 to 100 USD range, where the average rating rises steeply with price. Beyond 100 USD the curve flattens, and higher prices are associated with only a slight improvement in rating.

Code

# Average rating at each distinct price point, with a smoothed trend line.
# Wines priced at 500 USD or more are left out of the plot.
wine_data |>
  filter(price < 500) |>
  group_by(price) |>
  summarise(avr_rating = mean(points)) |>
  ggplot(aes(x = price, y = avr_rating)) +
  geom_point(alpha = 0.5) +
  geom_smooth(color = "red") +
  theme_minimal() +
  labs(
    title = "Price vs. Quality",
    x = "Price (USD)",
    y = "Average rating (Points)"
  )

Frequency of wine adjectives

Code

# Wine-tasting adjectives to look for in the review text
adjectives <- c("Acidic", "Astringent", "Barnyard", "Buttery", "Chewy",
                "Earthy", "Flabby", "Fruity", "Fruit-forward",
                "Herbaceous", "Jammy", "Juicy", "Musty", "Nutty",
                "Oaky", "Opulent", "Perfumed", "Racy", "Spicy",
                "Supple", "Tannic", "Toasty", "Vegetal", "Velvety")

# Case-insensitive regex that matches any of the adjectives as a whole word.
# The \\b anchors prevent substring hits such as "racy" inside "accuracy" or
# "supple" inside "supplement", which would otherwise inflate the counts.
pattern <- paste0("(?i)\\b(?:", paste(adjectives, collapse = "|"), ")\\b")

# Keep only the reviews whose description uses at least one adjective.
# str_extract() returns the first match only, so each review is tagged with
# the adjective that appears earliest in its description (lower-cased).
wine_with_adj <- wine_data |>
  mutate(
    adjective = factor(tolower(str_extract(description, pattern)))
  ) |>
  filter(!is.na(adjective))

# Horizontal bar chart: number of reviews tagged with each adjective
wine_with_adj |>
  count(adjective, name = "count") |>
  ggplot(
    aes(
      x = count,
      y = fct_reorder(adjective, count, .desc = TRUE),
      fill = adjective
    )
  ) +
  geom_col(width = 0.8, color = "black", show.legend = FALSE) +
  theme_minimal() +
  labs(
    title = "Adjective Frequency in Wine Reviews",
    x = "Number of times mentioned",
    y = NULL
  )

Average Price and Rating of Wine by Flavor Note

Code

# One point per adjective: mean price against mean rating of the reviews
# tagged with it, labelled with the adjective itself.
wine_with_adj |>
  group_by(adjective) |>
  summarise(
    avr_price = mean(price),
    avr_points = mean(points)
  ) |>
  ggplot(aes(x = avr_price, y = avr_points)) +
  geom_point(size = 3) +
  # Repelled labels keep neighbouring adjectives from overlapping
  geom_label_repel(
    aes(label = adjective),
    size = 2.5,
    color = "black",
    fill = "lightyellow",
    label.size = 0.25,
    label.padding = 0.25,
    box.padding = 0.25
  ) +
  theme_minimal() +
  labs(
    title = "Average Price and Rating of Wine by Flavor Note",
    x = "Average Price (USD)",
    y = "Average Points (Points)"
  )

Berry Notes

Code

# Reviews whose description contains a word ending in "erry". The first such
# word in each description is stored, lower-cased, in `berry`; reviews without
# one are dropped.
berries <- wine_data |>
  mutate(berry = tolower(str_extract(description, "(?i)\\b\\w+erry\\b"))) |>
  filter(!is.na(berry))

# The six berry words that appear most often, ignoring the generic "berry"
topberries <- berries |>
  filter(berry != "berry") |>
  count(berry, name = "count") |>
  arrange(desc(count)) |>
  slice_head(n = 6)

# Average wine price for each of the top berry notes, most expensive first
berries |>
  filter(berry %in% topberries$berry) |>
  group_by(berry) |>
  summarise(avr_price = mean(price)) |>
  ggplot(
    aes(
      x = fct_reorder(berry, avr_price, .desc = TRUE),
      y = avr_price,
      fill = berry
    )
  ) +
  geom_col(color = "black", show.legend = FALSE) +
  # Berry names are written inside the bars, near the top, instead of on
  # the x axis
  geom_text(
    aes(label = berry),
    position = position_stack(vjust = 0.9),
    color = "black",
    size = 3.5,
    alpha = 0.8
  ) +
  # Zoom the y axis to the 25-45 USD band to make the differences visible
  coord_cartesian(ylim = c(25, 45)) +
  theme_minimal() +
  theme(axis.text.x = element_blank()) +
  labs(
    title = "Average Price of Wine with Different Berry Notes",
    x = NULL,
    y = "Average price (USD)"
  )

Thank you for watching!