fork download
  1. import pandas as pd
  2. import numpy as np
  3. from sklearn.ensemble import IsolationForest
  4. from sklearn.preprocessing import StandardScaler
  5. import matplotlib.pyplot as plt
  6.  
  # Demo: flag anomalous stock-market transactions with an Isolation Forest.
#
# The sample dataset has one row per transaction with a transaction ID,
# a price, a traded volume and a simple integer time index.
data = {
    'TransactionID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Price': [100, 105, 200, 150, 110, 107, 130, 155, 5000, 108],
    'Volume': [1000, 1100, 1200, 1000, 1300, 1100, 1400, 1500, 12000, 1050],
    'Time': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],  # Simple time variable for illustration
}

# Explicit column order: on interpreters where dicts are unordered the
# DataFrame columns would otherwise come out in arbitrary/sorted order.
COLUMN_ORDER = ['TransactionID', 'Price', 'Volume', 'Time']


def build_transactions():
    """Return the sample transactions as a DataFrame with a fixed column order."""
    return pd.DataFrame(data, columns=COLUMN_ORDER)


def detect_anomalies(df, feature_columns=('Price', 'Volume'),
                     contamination=0.2, random_state=42):
    """Return a copy of ``df`` with an added 'Anomaly' column.

    The selected feature columns are standardized and fed to an
    Isolation Forest; the 'Anomaly' column holds the model's prediction
    for each row (1 is normal, -1 is anomalous).

    Args:
        df: DataFrame containing at least ``feature_columns``.
        feature_columns: names of the numeric columns used for detection.
        contamination: expected share of anomalous rows (0.2 means 20% of
            the transactions could be fraudulent).
        random_state: seed passed to the forest so repeated runs flag the
            same rows.
    """
    # Cast to float up front: StandardScaler converts integer input anyway
    # and emits a DataConversionWarning when it has to do so itself.
    features = df[list(feature_columns)].astype(float)

    # Standardize so every feature is on a comparable scale.
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features)

    model = IsolationForest(contamination=contamination,
                            random_state=random_state)
    model.fit(scaled_features)

    # Work on a copy so the caller's frame is left untouched.
    labelled = df.copy()
    labelled['Anomaly'] = model.predict(scaled_features)
    return labelled


def plot_anomalies(df):
    """Show a Price/Volume scatter plot coloured by the 'Anomaly' column."""
    plt.figure(figsize=(10, 6))
    plt.scatter(df['Price'], df['Volume'], c=df['Anomaly'], cmap='coolwarm',
                s=100, edgecolors='black')
    plt.title('Stock Market Transactions with Fraud Detection')
    plt.xlabel('Price')
    plt.ylabel('Volume')
    plt.colorbar(label='Anomaly')
    plt.show()


def main():
    """Run the demo: detect anomalies, print the labelled table, plot it."""
    df = detect_anomalies(build_transactions())

    print("Transaction Data with Anomaly Detection:")
    print(df)

    plot_anomalies(df)


if __name__ == "__main__":
    main()
Success #stdin #stdout #stderr 1.38s 113840KB
stdin
Standard input is empty
stdout
Transaction Data with Anomaly Detection:
   Price  Time  TransactionID  Volume  Anomaly
0    100     1              1    1000        1
1    105     2              2    1100        1
2    200     3              3    1200       -1
3    150     4              4    1000        1
4    110     5              5    1300        1
5    107     6              6    1100        1
6    130     7              7    1400        1
7    155     8              8    1500        1
8   5000     9              9   12000       -1
9    108    10             10    1050        1
stderr
/usr/local/lib/python2.7/dist-packages/sklearn/preprocessing/data.py:645: DataConversionWarning: Data with input dtype int64 were all converted to float64 by StandardScaler.
  return self.partial_fit(X, y)
/usr/local/lib/python2.7/dist-packages/sklearn/base.py:464: DataConversionWarning: Data with input dtype int64 were all converted to float64 by StandardScaler.
  return self.fit(X, **fit_params).transform(X)
/usr/local/lib/python2.7/dist-packages/sklearn/ensemble/iforest.py:223: FutureWarning: behaviour="old" is deprecated and will be removed in version 0.22. Please use behaviour="new", which makes the decision_function change to match other anomaly detection algorithm API.
  FutureWarning)
/usr/local/lib/python2.7/dist-packages/sklearn/ensemble/iforest.py:417: DeprecationWarning: threshold_ attribute is deprecated in 0.20 and will be removed in 0.22.
  " be removed in 0.22.", DeprecationWarning)