Your complete guide to mastering data analysis with Python
Explore the main libraries, pick up valuable tips, and speed up your data projects
# Reading a .csv file
import pandas as pd
df = pd.read_csv('dados.csv')
print(df.head())
# Reading an .xlsx (Excel) file
df_excel = pd.read_excel('seuarquivo.xlsx')
# If your file has more than one sheet and you want to read a specific one,
# use the 'sheet_name' parameter. It accepts the sheet name or its index (0 for the first sheet, 1 for the second, and so on).
# df_excel_aba = pd.read_excel('seuarquivo.xlsx', sheet_name='NomeDaMinhaAba')
# or
# df_excel_aba_indice = pd.read_excel('seuarquivo.xlsx', sheet_name=0)
print(df_excel.head())
import numpy as np
arr = np.array([1, 2, 3, 4, 5])
print(np.mean(arr))
import PyPDF2
import pdfplumber
# Example with PyPDF2 (plain text)
def extract_text_pypdf2(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text += page.extract_text()
    return text
# Example with pdfplumber (text and tables)
def extract_data_pdfplumber(pdf_path):
    extracted_text = ""
    extracted_tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            extracted_text += page.extract_text() + "\n"
            tables = page.extract_tables()
            if tables:
                extracted_tables.extend(tables)
    return extracted_text, extracted_tables
pdf_text_pypdf2 = extract_text_pypdf2('documento.pdf')
# text, tables = extract_data_pdfplumber('documento_com_tabelas.pdf')
print("Texto PyPDF2 (trecho):", pdf_text_pypdf2[:200])
import pytesseract
import fitz  # PyMuPDF, for handling PDFs
from PIL import Image
import io
# NOTE: make sure the Tesseract OCR engine is installed on your system!
# Specify its path if needed (e.g., on Windows):
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
def extract_text_from_scanned_pdf(pdf_path):
    text = ""
    pdf_document = fitz.open(pdf_path)
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        pix = page.get_pixmap()
        img = Image.open(io.BytesIO(pix.pil_tobytes(format="png")))
        text += pytesseract.image_to_string(img, lang='por')  # 'por' for Portuguese
    pdf_document.close()
    return text
scanned_pdf_content = extract_text_from_scanned_pdf('documento_digitalizado.pdf')
print("Texto OCR (trecho):", scanned_pdf_content[:200])
import polars as pl
df = pl.read_csv('dados.csv')
print(df.head())
from scipy import stats
data = [1, 2, 3, 4, 5]
print(stats.describe(data))
from datetime import datetime
now = datetime.now()
print(now.strftime('%Y-%m-%d'))
import missingno as msno
msno.matrix(df)
msno.bar(df)
import matplotlib.pyplot as plt
plt.plot([1, 2, 3, 4], [1, 4, 2, 3])
plt.show()
import seaborn as sns
sns.histplot(data=df, x='idade')
plt.show()
import plotly.express as px
fig = px.bar(df, x='ano', y='vendas')
fig.show()
import sweetviz as sv
report = sv.analyze(df)
report.show_html()
import dtale
d = dtale.show(df)
d.open_browser()
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)
import xgboost as xgb
model = xgb.XGBRegressor()
model.fit(X_train, y_train)
import lightgbm as lgb
model = lgb.LGBMRegressor()
model.fit(X_train, y_train)
import tensorflow as tf
from tensorflow import keras
model = keras.Sequential([keras.layers.Dense(units=1, input_shape=[1])])
model.compile(optimizer='sgd', loss='mean_squared_error')
import torch
import torch.nn as nn
model = nn.Linear(10, 1)
loss_fn = nn.MSELoss()
from tensorflow import keras
model = keras.Sequential([
    keras.layers.Dense(units=64, activation='relu', input_shape=(X_train.shape[1],)),
    keras.layers.Dense(units=1)
])
model.compile(optimizer='adam', loss='mse')
import category_encoders as ce
encoder = ce.OneHotEncoder()
X_encoded = encoder.fit_transform(X)
from mlxtend.frequent_patterns import apriori
frequent_itemsets = apriori(df, min_support=0.1)
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=100, n_features=4, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
model_svm = svm.SVC(kernel='linear') # Support Vector Classifier
model_svm.fit(X_train, y_train)
print(model_svm.score(X_test, y_test))
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
model_dt = DecisionTreeClassifier(random_state=42)
model_dt.fit(X_train, y_train)
print(model_dt.score(X_test, y_test))
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)
print(model_rf.score(X_test, y_test))
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
X, y = make_regression(n_samples=100, n_features=1, noise=0.5, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
model_lr = LinearRegression()
model_lr.fit(X_train, y_train)
print(model_lr.score(X_test, y_test))
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
X, y = make_blobs(n_samples=300, centers=4, random_state=42)
kmeans = KMeans(n_clusters=4, random_state=42, n_init='auto')
kmeans.fit(X)
labels = kmeans.labels_
# plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis')
# plt.show()
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
model_logreg = LogisticRegression(max_iter=1000, random_state=42)
model_logreg.fit(X_train, y_train)
print(f"Logistic Regression Accuracy: {model_logreg.score(X_test, y_test)}")
from efficient_apriori import apriori
itemsets, rules = apriori(transactions, min_support=0.1)
from sqlalchemy import create_engine
engine = create_engine('postgresql://user:pass@localhost/db')
df.to_sql('tabela', engine)
import psycopg2
conn = psycopg2.connect("host=localhost dbname=test")
cur = conn.cursor()
import pyodbc
conn = pyodbc.connect('DRIVER={SQL Server};SERVER=localhost;DATABASE=test')
cursor = conn.cursor()
import openpyxl
wb = openpyxl.load_workbook('arquivo.xlsx')
ws = wb.active
import dask.dataframe as dd
df = dd.read_csv('arquivo_grande.csv')
resultado = df.groupby('coluna').sum().compute()
import chardet
with open('arquivo.txt', 'rb') as f:
    resultado = chardet.detect(f.read())
%load_ext sql
%sql postgresql://user:pass@localhost/db
%%sql
SELECT * FROM tabela LIMIT 10
import pandas_datareader as pdr
data = pdr.get_data_yahoo('AAPL', start='2020-01-01')
import nltk
# First use may require: nltk.download('punkt') and nltk.download('averaged_perceptron_tagger')
tokens = nltk.word_tokenize("Olá mundo!")
tagged = nltk.pos_tag(tokens)
import spacy
# Requires the model: python -m spacy download pt_core_news_sm
nlp = spacy.load("pt_core_news_sm")
doc = nlp("Olá mundo!")
from textblob import TextBlob
blob = TextBlob("Bom dia!")
print(blob.sentiment)
import requests
response = requests.get('https://api.exemplo.com/dados')
data = response.json()
import openai
# openai.api_key = "your_api_key"
# Chat models such as gpt-3.5-turbo use the ChatCompletion endpoint (openai<1.0 SDK)
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Olá!"}]
)
import google.generativeai as genai
genai.configure(api_key="sua_api_key")
model = genai.GenerativeModel('gemini-pro')
response = model.generate_content("Olá!")
%load_ext watermark
%watermark -v -m -p pandas,numpy -g
Prefer df.apply() over for loops whenever possible.
Use df.dtypes to check data types and pd.to_numeric() with downcast to reduce memory usage (see the first sketch after these tips).
Use df.info(), df.describe() and df.head() to understand your data before starting the analysis.
Use missingno to visualize missing-data patterns and choose the appropriate strategy (removal, imputation, etc.).
Validate assumptions about your data with assert statements (see the second sketch after these tips).
Use cross_val_score or KFold to evaluate models robustly (see the third sketch after these tips).
Use category_encoders for categorical variables.
Use venv or conda to isolate your projects' dependencies.
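A minimal sketch of the downcast tip, assuming a hypothetical DataFrame with an integer column named 'vendas':
import pandas as pd
df = pd.DataFrame({'vendas': [100, 200, 300]})
print(df.dtypes)  # default 64-bit integers
# Downcast to the smallest integer type that fits the values
df['vendas'] = pd.to_numeric(df['vendas'], downcast='integer')
print(df.dtypes)  # smaller integer dtype, less memory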
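A minimal sketch of assert-based validation, using a hypothetical 'idade' column only for illustration:
import pandas as pd
df = pd.DataFrame({'idade': [25, 32, 47]})
assert not df.empty, "DataFrame is empty"
assert df['idade'].notna().all(), "Column 'idade' has missing values"
assert (df['idade'] >= 0).all(), "Column 'idade' has negative values"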
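A minimal sketch of robust evaluation with cross_val_score, reusing the iris dataset from the examples above:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
iris = load_iris()
X, y = iris.data, iris.target
model = RandomForestClassifier(n_estimators=100, random_state=42)
scores = cross_val_score(model, X, y, cv=5)  # one accuracy score per fold
print(scores.mean(), scores.std())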